From: Igor Breger Date: Sun, 27 Dec 2015 13:56:16 +0000 (+0000) Subject: AVX512: Change VPMOVB2M DAG lowering , use CVT2MASK node instead TRUNCATE. X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=3f202fdf9eea174a8be886b51074814289a822df;p=oota-llvm.git AVX512: Change VPMOVB2M DAG lowering , use CVT2MASK node instead TRUNCATE. Fix TRUNCATE lowering vector to vector i1, use LSB and not MSB. Implement VPMOVB/W/D/Q2M intrinsic. Differential Revision: http://reviews.llvm.org/D15675 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@256470 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index cd549600325..08005844cea 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -4084,17 +4084,45 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_cvtsi2ss32 : GCCBuiltin<"__builtin_ia32_cvtsi2ss32">, - Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, - llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, + llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_cvtsi2ss64 : GCCBuiltin<"__builtin_ia32_cvtsi2ss64">, - Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, - llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, + llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_cvtsi2sd32 : GCCBuiltin<"__builtin_ia32_cvtsi2sd32">, - Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, - llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, + llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_cvtsi2sd64 : GCCBuiltin<"__builtin_ia32_cvtsi2sd64">, - Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, - llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, + llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>; + + def int_x86_avx512_cvtb2mask_128 : GCCBuiltin<"__builtin_ia32_cvtb2mask128">, + Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty], [IntrNoMem]>; + def int_x86_avx512_cvtb2mask_256 : GCCBuiltin<"__builtin_ia32_cvtb2mask256">, + Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty], [IntrNoMem]>; + def int_x86_avx512_cvtb2mask_512 : GCCBuiltin<"__builtin_ia32_cvtb2mask512">, + Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty], [IntrNoMem]>; + + def int_x86_avx512_cvtw2mask_128 : GCCBuiltin<"__builtin_ia32_cvtw2mask128">, + Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty], [IntrNoMem]>; + def int_x86_avx512_cvtw2mask_256 : GCCBuiltin<"__builtin_ia32_cvtw2mask256">, + Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty], [IntrNoMem]>; + def int_x86_avx512_cvtw2mask_512 : GCCBuiltin<"__builtin_ia32_cvtw2mask512">, + Intrinsic<[llvm_i32_ty], [llvm_v32i16_ty], [IntrNoMem]>; + + def int_x86_avx512_cvtd2mask_128 : GCCBuiltin<"__builtin_ia32_cvtd2mask128">, + Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty], [IntrNoMem]>; + def int_x86_avx512_cvtd2mask_256 : GCCBuiltin<"__builtin_ia32_cvtd2mask256">, + Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty], [IntrNoMem]>; + def int_x86_avx512_cvtd2mask_512 : GCCBuiltin<"__builtin_ia32_cvtd2mask512">, + Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty], [IntrNoMem]>; + + def int_x86_avx512_cvtq2mask_128 : GCCBuiltin<"__builtin_ia32_cvtq2mask128">, + Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty], [IntrNoMem]>; + def int_x86_avx512_cvtq2mask_256 : GCCBuiltin<"__builtin_ia32_cvtq2mask256">, + Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty], [IntrNoMem]>; + def int_x86_avx512_cvtq2mask_512 : GCCBuiltin<"__builtin_ia32_cvtq2mask512">, + Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty], [IntrNoMem]>; def int_x86_avx512_cvtmask2b_128 : GCCBuiltin<"__builtin_ia32_cvtmask2b128">, Intrinsic<[llvm_v16i8_ty], [llvm_i16_ty], [IntrNoMem]>; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 7719bef94e6..0d7258dd2a7 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -13365,6 +13365,62 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, return SDValue(); } +static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); + MVT InVT = In.getSimpleValueType(); + + assert(VT.getVectorElementType() == MVT::i1 && "Unexected vector type."); + + // Shift LSB to MSB and use VPMOVB2M - SKX. + unsigned ShiftInx = InVT.getScalarSizeInBits() - 1; + if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && + Subtarget->hasBWI()) || // legal, will go to VPMOVB2M, VPMOVW2M + ((InVT.is256BitVector() || InVT.is128BitVector()) && + InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI() && + Subtarget->hasVLX())) { // legal, will go to VPMOVB2M, VPMOVW2M + // Shift packed bytes not supported natively, bitcast to dword + MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT, + DAG.getBitcast(ExtVT, In), + DAG.getConstant(ShiftInx, DL, ExtVT)); + ShiftNode = DAG.getBitcast(InVT, ShiftNode); + return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); + } + if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && + Subtarget->hasDQI()) || // legal, will go to VPMOVD2M, VPMOVQ2M + ((InVT.is256BitVector() || InVT.is128BitVector()) && + InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI() && + Subtarget->hasVLX())) { // legal, will go to VPMOVD2M, VPMOVQ2M + + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, + DAG.getConstant(ShiftInx, DL, InVT)); + return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); + } + + // Shift LSB to MSB, extend if necessary and use TESTM. + unsigned NumElts = InVT.getVectorNumElements(); + if (InVT.getSizeInBits() < 512 && + (InVT.getScalarType() == MVT::i8 || InVT.getScalarType() == MVT::i16 || + !Subtarget->hasVLX())) { + assert((NumElts == 8 || NumElts == 16) && "Unexected vector type."); + + // TESTD/Q should be used (if BW supported we use CVT2MASK above), + // so vector should be extended to packed dword/qword. + MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); + InVT = ExtVT; + ShiftInx = InVT.getScalarSizeInBits() - 1; + } + + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, + DAG.getConstant(ShiftInx, DL, InVT)); + return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode); +} + SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); @@ -13382,39 +13438,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); - // move vector to mask - truncate solution for SKX - if (VT.getVectorElementType() == MVT::i1) { - if (InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && - Subtarget->hasBWI()) - return Op; // legal, will go to VPMOVB2M, VPMOVW2M - if ((InVT.is256BitVector() || InVT.is128BitVector()) - && InVT.getScalarSizeInBits() <= 16 && - Subtarget->hasBWI() && Subtarget->hasVLX()) - return Op; // legal, will go to VPMOVB2M, VPMOVW2M - if (InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && - Subtarget->hasDQI()) - return Op; // legal, will go to VPMOVD2M, VPMOVQ2M - if ((InVT.is256BitVector() || InVT.is128BitVector()) - && InVT.getScalarSizeInBits() >= 32 && - Subtarget->hasDQI() && Subtarget->hasVLX()) - return Op; // legal, will go to VPMOVB2M, VPMOVQ2M - } - - if (VT.getVectorElementType() == MVT::i1) { - assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); - unsigned NumElts = InVT.getVectorNumElements(); - assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type"); - if (InVT.getSizeInBits() < 512) { - MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64; - In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); - InVT = ExtVT; - } - - SDValue OneV = - DAG.getConstant(APInt::getSignBit(InVT.getScalarSizeInBits()), DL, InVT); - SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In); - return DAG.getNode(X86ISD::TESTM, DL, VT, And, And); - } + if (VT.getVectorElementType() == MVT::i1) + return LowerTruncateVecI1(Op, DAG, Subtarget); // vpmovqb/w/d, vpmovdb/w, vpmovwb if (Subtarget->hasAVX512()) { @@ -16821,7 +16846,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget } case BROADCASTM: { SDValue Mask = Op.getOperand(1); - MVT MaskVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); + MVT MaskVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); Mask = DAG.getBitcast(MaskVT, Mask); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); } @@ -16845,6 +16871,18 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Src2, Src1); return DAG.getBitcast(VT, Res); } + case CONVERT_TO_MASK: { + MVT SrcVT = Op.getOperand(1).getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); + + SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT, + Op.getOperand(1)); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), CvtMask, + DAG.getIntPtrConstant(0, dl)); + return DAG.getBitcast(Op.getValueType(), Res); + } case CONVERT_MASK_TO_VEC: { SDValue Mask = Op.getOperand(1); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); @@ -20612,6 +20650,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD"; case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD"; + case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 50886550fde..00e83a3c652 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -304,6 +304,9 @@ namespace llvm { // Vector signed/unsigned integer to double. CVTDQ2PD, CVTUDQ2PD, + // Convert a vector to mask, set bits base on MSB. + CVT2MASK, + // 128-bit vector logical left / right shift VSHLDQ, VSRLDQ, diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 8763ef8c90b..03d610313eb 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -6619,7 +6619,7 @@ defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">; multiclass convert_vector_to_mask_common opc, X86VectorVTInfo _, string OpcodeStr > { def rr : AVX512XS8I, EVEX; + [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX; } multiclass avx512_convert_vector_to_mask opc, string OpcodeStr, diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 6ec2396c3ba..829cedd55fb 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -581,6 +581,8 @@ def X86vfproundRnd: SDNode<"X86ISD::VFPROUND", SDTCisOpSmallerThanOp<0, 1>, SDTCisVT<2, i32>]>>; +def X86cvt2mask : SDNode<"X86ISD::CVT2MASK", SDTIntTruncOp>; + //===----------------------------------------------------------------------===// // SSE Complex Patterns //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index c06b1ce38bf..5806985b5ce 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -30,7 +30,7 @@ enum IntrinsicType { COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, EXPAND_FROM_MEM, BLEND, INSERT_SUBVEC, - TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, CONVERT_MASK_TO_VEC + TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK }; struct IntrinsicData { @@ -324,6 +324,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0), X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0), X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtd2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtd2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtd2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtmask2b_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(avx512_cvtmask2b_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(avx512_cvtmask2b_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), @@ -336,6 +342,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_cvtmask2w_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(avx512_cvtmask2w_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(avx512_cvtmask2w_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtq2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtsi2sd32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), @@ -351,6 +360,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0), diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll index edb6bef1a4a..a61aeba5aff 100644 --- a/test/CodeGen/X86/avx512-calling-conv.ll +++ b/test/CodeGen/X86/avx512-calling-conv.ll @@ -1,55 +1,167 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=SKX +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL_X64 --check-prefix=SKX ; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL_X32 -; KNL-LABEL: test1 -; KNL: vxorps define <16 x i1> @test1() { +; ALL_X64-LABEL: test1: +; ALL_X64: ## BB#0: +; ALL_X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ALL_X64-NEXT: retq +; +; KNL_X32-LABEL: test1: +; KNL_X32: ## BB#0: +; KNL_X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; KNL_X32-NEXT: retl ret <16 x i1> zeroinitializer } -; SKX-LABEL: test2 -; SKX: vpmovb2m -; SKX: vpmovb2m -; SKX: kandw -; SKX: vpmovm2b -; KNL-LABEL: test2 -; KNL: vpmovsxbd -; KNL: vpmovsxbd -; KNL: vpandd -; KNL: vpmovdb define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) { +; KNL-LABEL: test2: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} +; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test2: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 +; SKX-NEXT: vpmovb2m %xmm1, %k0 +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 +; SKX-NEXT: vpmovb2m %xmm0, %k1 +; SKX-NEXT: kandw %k0, %k1, %k0 +; SKX-NEXT: vpmovm2b %k0, %xmm0 +; SKX-NEXT: retq +; +; KNL_X32-LABEL: test2: +; KNL_X32: ## BB#0: +; KNL_X32-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL_X32-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL_X32-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL_X32-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL_X32-NEXT: vptestmd %zmm0, %zmm0, %k1 +; KNL_X32-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} +; KNL_X32-NEXT: vpbroadcastd LCPI1_0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0 +; KNL_X32-NEXT: retl %c = and <16 x i1>%a, %b ret <16 x i1> %c } -; SKX-LABEL: test3 -; SKX: vpmovw2m -; SKX: vpmovw2m -; SKX: kandb -; SKX: vpmovm2w define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) { +; KNL-LABEL: test3: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxwq %xmm1, %zmm1 +; KNL-NEXT: vpsllq $63, %zmm1, %zmm1 +; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1} +; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpmovqw %zmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test3: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 +; SKX-NEXT: vpmovw2m %xmm1, %k0 +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpmovw2m %xmm0, %k1 +; SKX-NEXT: kandb %k0, %k1, %k0 +; SKX-NEXT: vpmovm2w %k0, %xmm0 +; SKX-NEXT: retq +; +; KNL_X32-LABEL: test3: +; KNL_X32: ## BB#0: +; KNL_X32-NEXT: vpmovsxwq %xmm1, %zmm1 +; KNL_X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [63,0,63,0,63,0,63,0,63,0,63,0,63,0,63,0] +; KNL_X32-NEXT: vpsllvq %zmm2, %zmm1, %zmm1 +; KNL_X32-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL_X32-NEXT: vpsllvq %zmm2, %zmm0, %zmm0 +; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL_X32-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1} +; KNL_X32-NEXT: vpbroadcastd LCPI2_1, %zmm0 +; KNL_X32-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0 +; KNL_X32-NEXT: retl %c = and <8 x i1>%a, %b ret <8 x i1> %c } -; SKX-LABEL: test4 -; SKX: vpmovd2m -; SKX: vpmovd2m -; SKX: kandw -; SKX: vpmovm2d define <4 x i1> @test4(<4 x i1>%a, <4 x i1>%b) { +; KNL-LABEL: test4: +; KNL: ## BB#0: +; KNL-NEXT: vandps %xmm1, %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test4: +; SKX: ## BB#0: +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 +; SKX-NEXT: vpmovd2m %xmm0, %k0 +; SKX-NEXT: vpslld $31, %xmm1, %xmm0 +; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: kandw %k1, %k0, %k0 +; SKX-NEXT: vpmovm2d %k0, %xmm0 +; SKX-NEXT: retq +; +; KNL_X32-LABEL: test4: +; KNL_X32: ## BB#0: +; KNL_X32-NEXT: vandps %xmm1, %xmm0, %xmm0 +; KNL_X32-NEXT: retl %c = and <4 x i1>%a, %b ret <4 x i1> %c } -; SKX-LABEL: test5 -; SKX: vpcmpgtd -; SKX: vpmovm2w -; SKX: call -; SKX: vpmovzxwd declare <8 x i1> @func8xi1(<8 x i1> %a) + define <8 x i32> @test5(<8 x i32>%a, <8 x i32>%b) { +; KNL-LABEL: test5: +; KNL: ## BB#0: +; KNL-NEXT: pushq %rax +; KNL-NEXT: Ltmp0: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: callq _func8xi1 +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-NEXT: vpslld $31, %ymm0, %ymm0 +; KNL-NEXT: vpsrad $31, %ymm0, %ymm0 +; KNL-NEXT: popq %rax +; KNL-NEXT: retq +; +; SKX-LABEL: test5: +; SKX: ## BB#0: +; SKX-NEXT: pushq %rax +; SKX-NEXT: Ltmp0: +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 +; SKX-NEXT: vpmovm2w %k0, %xmm0 +; SKX-NEXT: callq _func8xi1 +; SKX-NEXT: vpmovzxwd %xmm0, %ymm0 +; SKX-NEXT: vpslld $31, %ymm0, %ymm0 +; SKX-NEXT: vpsrad $31, %ymm0, %ymm0 +; SKX-NEXT: popq %rax +; SKX-NEXT: retq +; +; KNL_X32-LABEL: test5: +; KNL_X32: ## BB#0: +; KNL_X32-NEXT: subl $12, %esp +; KNL_X32-NEXT: Ltmp0: +; KNL_X32-NEXT: .cfi_def_cfa_offset 16 +; KNL_X32-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0 +; KNL_X32-NEXT: calll L_func8xi1$stub +; KNL_X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL_X32-NEXT: vpslld $31, %ymm0, %ymm0 +; KNL_X32-NEXT: vpsrad $31, %ymm0, %ymm0 +; KNL_X32-NEXT: addl $12, %esp +; KNL_X32-NEXT: retl %cmpRes = icmp sgt <8 x i32>%a, %b %resi = call <8 x i1> @func8xi1(<8 x i1> %cmpRes) %res = sext <8 x i1>%resi to <8 x i32> @@ -58,14 +170,50 @@ define <8 x i32> @test5(<8 x i32>%a, <8 x i32>%b) { declare <16 x i1> @func16xi1(<16 x i1> %a) -; KNL-LABEL: test6 -; KNL: vpbroadcastd -; KNL: vpmovdb -; KNL: call -; KNL: vpmovzxbd -; KNL: vpslld $31, %zmm -; KNL: vpsrad $31, %zmm define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) { +; KNL-LABEL: test6: +; KNL: ## BB#0: +; KNL-NEXT: pushq %rax +; KNL-NEXT: Ltmp1: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: callq _func16xi1 +; KNL-NEXT: vpmovzxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vpsrad $31, %zmm0, %zmm0 +; KNL-NEXT: popq %rax +; KNL-NEXT: retq +; +; SKX-LABEL: test6: +; SKX: ## BB#0: +; SKX-NEXT: pushq %rax +; SKX-NEXT: Ltmp1: +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; SKX-NEXT: vpmovm2b %k0, %xmm0 +; SKX-NEXT: callq _func16xi1 +; SKX-NEXT: vpmovzxbd %xmm0, %zmm0 +; SKX-NEXT: vpslld $31, %zmm0, %zmm0 +; SKX-NEXT: vpsrad $31, %zmm0, %zmm0 +; SKX-NEXT: popq %rax +; SKX-NEXT: retq +; +; KNL_X32-LABEL: test6: +; KNL_X32: ## BB#0: +; KNL_X32-NEXT: subl $12, %esp +; KNL_X32-NEXT: Ltmp1: +; KNL_X32-NEXT: .cfi_def_cfa_offset 16 +; KNL_X32-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; KNL_X32-NEXT: vpbroadcastd LCPI5_0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0 +; KNL_X32-NEXT: calll L_func16xi1$stub +; KNL_X32-NEXT: vpmovzxbd %xmm0, %zmm0 +; KNL_X32-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL_X32-NEXT: vpsrad $31, %zmm0, %zmm0 +; KNL_X32-NEXT: addl $12, %esp +; KNL_X32-NEXT: retl %cmpRes = icmp sgt <16 x i32>%a, %b %resi = call <16 x i1> @func16xi1(<16 x i1> %cmpRes) %res = sext <16 x i1>%resi to <16 x i32> @@ -73,82 +221,265 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) { } declare <4 x i1> @func4xi1(<4 x i1> %a) -; SKX-LABEL: test7 -; SKX: vpmovm2d -; SKX: call -; SKX: vpslld $31, %xmm -; SKX: vpsrad $31, %xmm define <4 x i32> @test7(<4 x i32>%a, <4 x i32>%b) { +; KNL-LABEL: test7: +; KNL: ## BB#0: +; KNL-NEXT: pushq %rax +; KNL-NEXT: Ltmp2: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; KNL-NEXT: callq _func4xi1 +; KNL-NEXT: vpslld $31, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: popq %rax +; KNL-NEXT: retq +; +; SKX-LABEL: test7: +; SKX: ## BB#0: +; SKX-NEXT: pushq %rax +; SKX-NEXT: Ltmp2: +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; SKX-NEXT: vpmovm2d %k0, %xmm0 +; SKX-NEXT: callq _func4xi1 +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 +; SKX-NEXT: vpsrad $31, %xmm0, %xmm0 +; SKX-NEXT: popq %rax +; SKX-NEXT: retq +; +; KNL_X32-LABEL: test7: +; KNL_X32: ## BB#0: +; KNL_X32-NEXT: subl $12, %esp +; KNL_X32-NEXT: Ltmp2: +; KNL_X32-NEXT: .cfi_def_cfa_offset 16 +; KNL_X32-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; KNL_X32-NEXT: calll L_func4xi1$stub +; KNL_X32-NEXT: vpslld $31, %xmm0, %xmm0 +; KNL_X32-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL_X32-NEXT: addl $12, %esp +; KNL_X32-NEXT: retl %cmpRes = icmp sgt <4 x i32>%a, %b %resi = call <4 x i1> @func4xi1(<4 x i1> %cmpRes) %res = sext <4 x i1>%resi to <4 x i32> ret <4 x i32> %res } -; SKX-LABEL: test7a -; SKX: call -; SKX: vpmovw2m %xmm0, %k0 -; SKX: kandb define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) { +; KNL-LABEL: test7a: +; KNL: ## BB#0: +; KNL-NEXT: pushq %rax +; KNL-NEXT: Ltmp3: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: callq _func8xi1 +; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 +; KNL-NEXT: movb $85, %al +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} +; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpmovqw %zmm0, %xmm0 +; KNL-NEXT: popq %rax +; KNL-NEXT: retq +; +; SKX-LABEL: test7a: +; SKX: ## BB#0: +; SKX-NEXT: pushq %rax +; SKX-NEXT: Ltmp3: +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 +; SKX-NEXT: vpmovm2w %k0, %xmm0 +; SKX-NEXT: callq _func8xi1 +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpmovw2m %xmm0, %k0 +; SKX-NEXT: movb $85, %al +; SKX-NEXT: kmovb %eax, %k1 +; SKX-NEXT: kandb %k1, %k0, %k0 +; SKX-NEXT: vpmovm2w %k0, %xmm0 +; SKX-NEXT: popq %rax +; SKX-NEXT: retq +; +; KNL_X32-LABEL: test7a: +; KNL_X32: ## BB#0: +; KNL_X32-NEXT: subl $12, %esp +; KNL_X32-NEXT: Ltmp3: +; KNL_X32-NEXT: .cfi_def_cfa_offset 16 +; KNL_X32-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0 +; KNL_X32-NEXT: calll L_func8xi1$stub +; KNL_X32-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL_X32-NEXT: vpsllvq LCPI7_0, %zmm0, %zmm0 +; KNL_X32-NEXT: movb $85, %al +; KNL_X32-NEXT: movzbl %al, %eax +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} +; KNL_X32-NEXT: vpbroadcastd LCPI7_1, %zmm0 +; KNL_X32-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0 +; KNL_X32-NEXT: addl $12, %esp +; KNL_X32-NEXT: retl %cmpRes = icmp sgt <8 x i32>%a, %b %resi = call <8 x i1> @func8xi1(<8 x i1> %cmpRes) %res = and <8 x i1>%resi, ret <8 x i1> %res } - -; KNL_X32-LABEL: test8 -; KNL_X32: testb $1, 4(%esp) -; KNL_X32:jne - -; KNL-LABEL: test8 -; KNL: testb $1, %dil -; KNL:jne - define <16 x i8> @test8(<16 x i8> %a1, <16 x i8> %a2, i1 %cond) { +; ALL_X64-LABEL: test8: +; ALL_X64: ## BB#0: +; ALL_X64-NEXT: testb $1, %dil +; ALL_X64-NEXT: jne LBB8_2 +; ALL_X64-NEXT: ## BB#1: +; ALL_X64-NEXT: vmovaps %zmm1, %zmm0 +; ALL_X64-NEXT: LBB8_2: +; ALL_X64-NEXT: retq +; +; KNL_X32-LABEL: test8: +; KNL_X32: ## BB#0: +; KNL_X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: jne LBB8_2 +; KNL_X32-NEXT: ## BB#1: +; KNL_X32-NEXT: vmovaps %zmm1, %zmm0 +; KNL_X32-NEXT: LBB8_2: +; KNL_X32-NEXT: retl %res = select i1 %cond, <16 x i8> %a1, <16 x i8> %a2 ret <16 x i8> %res } -; KNL-LABEL: test9 -; KNL: vucomisd -; KNL: setb define i1 @test9(double %a, double %b) { +; ALL_X64-LABEL: test9: +; ALL_X64: ## BB#0: +; ALL_X64-NEXT: vucomisd %xmm0, %xmm1 +; ALL_X64-NEXT: setb %al +; ALL_X64-NEXT: retq +; +; KNL_X32-LABEL: test9: +; KNL_X32: ## BB#0: +; KNL_X32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 +; KNL_X32-NEXT: vucomisd {{[0-9]+}}(%esp), %xmm0 +; KNL_X32-NEXT: setb %al +; KNL_X32-NEXT: retl %c = fcmp ugt double %a, %b ret i1 %c } -; KNL_X32-LABEL: test10 -; KNL_X32: testb $1, 12(%esp) -; KNL_X32: cmovnel - -; KNL-LABEL: test10 -; KNL: testb $1, %dl -; KNL: cmovel define i32 @test10(i32 %a, i32 %b, i1 %cond) { +; ALL_X64-LABEL: test10: +; ALL_X64: ## BB#0: +; ALL_X64-NEXT: testb $1, %dl +; ALL_X64-NEXT: cmovel %esi, %edi +; ALL_X64-NEXT: movl %edi, %eax +; ALL_X64-NEXT: retq +; +; KNL_X32-LABEL: test10: +; KNL_X32: ## BB#0: +; KNL_X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: leal {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; KNL_X32-NEXT: cmovnel %eax, %ecx +; KNL_X32-NEXT: movl (%ecx), %eax +; KNL_X32-NEXT: retl %c = select i1 %cond, i32 %a, i32 %b ret i32 %c } -; KNL-LABEL: test11 -; KNL: cmp -; KNL: setg define i1 @test11(i32 %a, i32 %b) { +; ALL_X64-LABEL: test11: +; ALL_X64: ## BB#0: +; ALL_X64-NEXT: cmpl %esi, %edi +; ALL_X64-NEXT: setg %al +; ALL_X64-NEXT: retq +; +; KNL_X32-LABEL: test11: +; KNL_X32: ## BB#0: +; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: setg %al +; KNL_X32-NEXT: retl %c = icmp sgt i32 %a, %b ret i1 %c } -; KNL-LABEL: test12 -; KNL: callq _test11 -;; return value in %al -; KNL: movzbl %al, %ebx -; KNL: callq _test10 -; KNL: testb $1, %bl - define i32 @test12(i32 %a1, i32 %a2, i32 %b1) { +; ALL_X64-LABEL: test12: +; ALL_X64: ## BB#0: +; ALL_X64-NEXT: pushq %rbp +; ALL_X64-NEXT: Ltmp4: +; ALL_X64-NEXT: .cfi_def_cfa_offset 16 +; ALL_X64-NEXT: pushq %r14 +; ALL_X64-NEXT: Ltmp5: +; ALL_X64-NEXT: .cfi_def_cfa_offset 24 +; ALL_X64-NEXT: pushq %rbx +; ALL_X64-NEXT: Ltmp6: +; ALL_X64-NEXT: .cfi_def_cfa_offset 32 +; ALL_X64-NEXT: Ltmp7: +; ALL_X64-NEXT: .cfi_offset %rbx, -32 +; ALL_X64-NEXT: Ltmp8: +; ALL_X64-NEXT: .cfi_offset %r14, -24 +; ALL_X64-NEXT: Ltmp9: +; ALL_X64-NEXT: .cfi_offset %rbp, -16 +; ALL_X64-NEXT: movl %esi, %r14d +; ALL_X64-NEXT: movl %edi, %ebp +; ALL_X64-NEXT: movl %edx, %esi +; ALL_X64-NEXT: callq _test11 +; ALL_X64-NEXT: movzbl %al, %ebx +; ALL_X64-NEXT: movl %ebp, %edi +; ALL_X64-NEXT: movl %r14d, %esi +; ALL_X64-NEXT: movl %ebx, %edx +; ALL_X64-NEXT: callq _test10 +; ALL_X64-NEXT: xorl %ecx, %ecx +; ALL_X64-NEXT: testb $1, %bl +; ALL_X64-NEXT: cmovel %ecx, %eax +; ALL_X64-NEXT: popq %rbx +; ALL_X64-NEXT: popq %r14 +; ALL_X64-NEXT: popq %rbp +; ALL_X64-NEXT: retq +; +; KNL_X32-LABEL: test12: +; KNL_X32: ## BB#0: +; KNL_X32-NEXT: pushl %ebx +; KNL_X32-NEXT: Ltmp4: +; KNL_X32-NEXT: .cfi_def_cfa_offset 8 +; KNL_X32-NEXT: pushl %edi +; KNL_X32-NEXT: Ltmp5: +; KNL_X32-NEXT: .cfi_def_cfa_offset 12 +; KNL_X32-NEXT: pushl %esi +; KNL_X32-NEXT: Ltmp6: +; KNL_X32-NEXT: .cfi_def_cfa_offset 16 +; KNL_X32-NEXT: subl $16, %esp +; KNL_X32-NEXT: Ltmp7: +; KNL_X32-NEXT: .cfi_def_cfa_offset 32 +; KNL_X32-NEXT: Ltmp8: +; KNL_X32-NEXT: .cfi_offset %esi, -16 +; KNL_X32-NEXT: Ltmp9: +; KNL_X32-NEXT: .cfi_offset %edi, -12 +; KNL_X32-NEXT: Ltmp10: +; KNL_X32-NEXT: .cfi_offset %ebx, -8 +; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl %edi, (%esp) +; KNL_X32-NEXT: calll _test11 +; KNL_X32-NEXT: movb %al, %bl +; KNL_X32-NEXT: movzbl %bl, %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl %edi, (%esp) +; KNL_X32-NEXT: calll _test10 +; KNL_X32-NEXT: xorl %ecx, %ecx +; KNL_X32-NEXT: testb $1, %bl +; KNL_X32-NEXT: cmovel %ecx, %eax +; KNL_X32-NEXT: addl $16, %esp +; KNL_X32-NEXT: popl %esi +; KNL_X32-NEXT: popl %edi +; KNL_X32-NEXT: popl %ebx +; KNL_X32-NEXT: retl %cond = call i1 @test11(i32 %a1, i32 %b1) %res = call i32 @test10(i32 %a1, i32 %a2, i1 %cond) %res1 = select i1 %cond, i32 %res, i32 0 ret i32 %res1 -} \ No newline at end of file +} diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll index aa1dd4928c3..bc150968447 100644 --- a/test/CodeGen/X86/avx512-ext.ll +++ b/test/CodeGen/X86/avx512-ext.ll @@ -1,851 +1,1388 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=SKX - - ;SKX-LABEL: zext_8x8mem_to_8x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovzxbw (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=SKX + define <8 x i16> @zext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_8x8mem_to_8x16: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; KNL-NEXT: vpsllw $15, %xmm0, %xmm0 +; KNL-NEXT: vpsraw $15, %xmm0, %xmm0 +; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: zext_8x8mem_to_8x16: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpmovw2m %xmm0, %k1 +; SKX-NEXT: vpmovzxbw (%rdi), %xmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <8 x i8>,<8 x i8> *%i,align 1 - %x = zext <8 x i8> %a to <8 x i16> - %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer + %x = zext <8 x i8> %a to <8 x i16> + %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer ret <8 x i16> %ret } -;SKX-LABEL: sext_8x8mem_to_8x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovsxbw (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq define <8 x i16> @sext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_8x8mem_to_8x16: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxbw (%rdi), %xmm1 +; KNL-NEXT: vpsllw $15, %xmm0, %xmm0 +; KNL-NEXT: vpsraw $15, %xmm0, %xmm0 +; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sext_8x8mem_to_8x16: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpmovw2m %xmm0, %k1 +; SKX-NEXT: vpmovsxbw (%rdi), %xmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <8 x i8>,<8 x i8> *%i,align 1 - %x = sext <8 x i8> %a to <8 x i16> - %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer + %x = sext <8 x i8> %a to <8 x i16> + %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer ret <8 x i16> %ret } -;SKX-LABEL: zext_16x8mem_to_16x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %xmm0, %k1 -;SKX-NEXT: vpmovzxbw (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq + define <16 x i16> @zext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_16x8mem_to_16x16: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 +; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 +; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: zext_16x8mem_to_16x16: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 +; SKX-NEXT: vpmovb2m %xmm0, %k1 +; SKX-NEXT: vpmovzxbw (%rdi), %ymm0 {%k1} {z} +; SKX-NEXT: retq %a = load <16 x i8>,<16 x i8> *%i,align 1 - %x = zext <16 x i8> %a to <16 x i16> - %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer + %x = zext <16 x i8> %a to <16 x i16> + %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer ret <16 x i16> %ret } -;SKX-LABEL: sext_16x8mem_to_16x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %xmm0, %k1 -;SKX-NEXT: vpmovsxbw (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq define <16 x i16> @sext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_16x8mem_to_16x16: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; KNL-NEXT: vpmovsxbw (%rdi), %ymm1 +; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 +; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 +; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sext_16x8mem_to_16x16: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 +; SKX-NEXT: vpmovb2m %xmm0, %k1 +; SKX-NEXT: vpmovsxbw (%rdi), %ymm0 {%k1} {z} +; SKX-NEXT: retq %a = load <16 x i8>,<16 x i8> *%i,align 1 - %x = sext <16 x i8> %a to <16 x i16> - %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer + %x = sext <16 x i8> %a to <16 x i16> + %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer ret <16 x i16> %ret } -;SKX-LABEL: zext_16x8_to_16x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovzxbw %xmm0, %ymm0 -;SKX-NEXT: retq -define <16 x i16> @zext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone { - %x = zext <16 x i8> %a to <16 x i16> +define <16 x i16> @zext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone { +; KNL-LABEL: zext_16x8_to_16x16: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; KNL-NEXT: retq +; +; SKX-LABEL: zext_16x8_to_16x16: +; SKX: ## BB#0: +; SKX-NEXT: vpmovzxbw %xmm0, %ymm0 +; SKX-NEXT: retq + %x = zext <16 x i8> %a to <16 x i16> ret <16 x i16> %x } -;SKX-LABEL: zext_16x8_to_16x16_mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %xmm1, %k1 -;SKX-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z} -;SKX-NEXT: retq -define <16 x i16> @zext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone { - %x = zext <16 x i8> %a to <16 x i16> - %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer +define <16 x i16> @zext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_16x8_to_16x16_mask: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; KNL-NEXT: vpsllw $15, %ymm1, %ymm1 +; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 +; KNL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: zext_16x8_to_16x16_mask: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 +; SKX-NEXT: vpmovb2m %xmm1, %k1 +; SKX-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq + %x = zext <16 x i8> %a to <16 x i16> + %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer ret <16 x i16> %ret } -;SKX-LABEL: sext_16x8_to_16x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxbw %xmm0, %ymm0 -;SKX-NEXT: retq -define <16 x i16> @sext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone { - %x = sext <16 x i8> %a to <16 x i16> +define <16 x i16> @sext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone { +; ALL-LABEL: sext_16x8_to_16x16: +; ALL: ## BB#0: +; ALL-NEXT: vpmovsxbw %xmm0, %ymm0 +; ALL-NEXT: retq + %x = sext <16 x i8> %a to <16 x i16> ret <16 x i16> %x } -;SKX-LABEL: sext_16x8_to_16x16_mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %xmm1, %k1 -;SKX-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} -;SKX-NEXT: retq -define <16 x i16> @sext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone { - %x = sext <16 x i8> %a to <16 x i16> - %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer +define <16 x i16> @sext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_16x8_to_16x16_mask: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; KNL-NEXT: vpmovsxbw %xmm0, %ymm0 +; KNL-NEXT: vpsllw $15, %ymm1, %ymm1 +; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 +; KNL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sext_16x8_to_16x16_mask: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 +; SKX-NEXT: vpmovb2m %xmm1, %k1 +; SKX-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq + %x = sext <16 x i8> %a to <16 x i16> + %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer ret <16 x i16> %ret } -;SKX-LABEL: zext_32x8mem_to_32x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %ymm0, %k1 -;SKX-NEXT: vpmovzxbw (%rdi), %zmm0 {%k1} {z} -;SKX-NEXT: retq define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_32x8mem_to_32x16: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; KNL-NEXT: vpsllw $15, %ymm3, %ymm3 +; KNL-NEXT: vpsraw $15, %ymm3, %ymm3 +; KNL-NEXT: vpand %ymm2, %ymm3, %ymm2 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 +; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 +; KNL-NEXT: vpand %ymm1, %ymm0, %ymm1 +; KNL-NEXT: vmovaps %zmm2, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: zext_32x8mem_to_32x16: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 +; SKX-NEXT: vpmovb2m %ymm0, %k1 +; SKX-NEXT: vpmovzxbw (%rdi), %zmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <32 x i8>,<32 x i8> *%i,align 1 - %x = zext <32 x i8> %a to <32 x i16> - %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer + %x = zext <32 x i8> %a to <32 x i16> + %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret } -;SKX-LABEL: sext_32x8mem_to_32x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %ymm0, %k1 -;SKX-NEXT: vpmovsxbw (%rdi), %zmm0 {%k1} {z} -;SKX-NEXT: retq define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_32x8mem_to_32x16: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxbw 16(%rdi), %ymm1 +; KNL-NEXT: vpmovsxbw (%rdi), %ymm2 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; KNL-NEXT: vpsllw $15, %ymm3, %ymm3 +; KNL-NEXT: vpsraw $15, %ymm3, %ymm3 +; KNL-NEXT: vpand %ymm2, %ymm3, %ymm2 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 +; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 +; KNL-NEXT: vpand %ymm1, %ymm0, %ymm1 +; KNL-NEXT: vmovaps %zmm2, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sext_32x8mem_to_32x16: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 +; SKX-NEXT: vpmovb2m %ymm0, %k1 +; SKX-NEXT: vpmovsxbw (%rdi), %zmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <32 x i8>,<32 x i8> *%i,align 1 - %x = sext <32 x i8> %a to <32 x i16> - %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer + %x = sext <32 x i8> %a to <32 x i16> + %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret } -;SKX-LABEL: zext_32x8_to_32x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovzxbw %ymm0, %zmm0 -;SKX-NEXT: retq -define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { - %x = zext <32 x i8> %a to <32 x i16> +define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { +; KNL-LABEL: zext_32x8_to_32x16: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; KNL-NEXT: vmovaps %zmm2, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: zext_32x8_to_32x16: +; SKX: ## BB#0: +; SKX-NEXT: vpmovzxbw %ymm0, %zmm0 +; SKX-NEXT: retq + %x = zext <32 x i8> %a to <32 x i16> ret <32 x i16> %x } -;SKX-LABEL: zext_32x8_to_32x16_mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %ymm1, %k1 -;SKX-NEXT: vpmovzxbw %ymm0, %zmm0 {%k1} {z} -;SKX-NEXT: retq -define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone { +define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_32x8_to_32x16_mask: +; KNL: ## BB#0: +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; KNL-NEXT: vpsllw $15, %ymm3, %ymm3 +; KNL-NEXT: vpsraw $15, %ymm3, %ymm3 +; KNL-NEXT: vpand %ymm0, %ymm3, %ymm0 +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; KNL-NEXT: vpsllw $15, %ymm1, %ymm1 +; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 +; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: zext_32x8_to_32x16_mask: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %ymm1, %ymm1 +; SKX-NEXT: vpmovb2m %ymm1, %k1 +; SKX-NEXT: vpmovzxbw %ymm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq %x = zext <32 x i8> %a to <32 x i16> - %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer + %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret } -;SKX-LABEL: sext_32x8_to_32x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxbw %ymm0, %zmm0 -;SKX-NEXT: retq -define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { - %x = sext <32 x i8> %a to <32 x i16> +define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { +; KNL-LABEL: sext_32x8_to_32x16: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxbw %xmm0, %ymm2 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vpmovsxbw %xmm0, %ymm1 +; KNL-NEXT: vmovaps %zmm2, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sext_32x8_to_32x16: +; SKX: ## BB#0: +; SKX-NEXT: vpmovsxbw %ymm0, %zmm0 +; SKX-NEXT: retq + %x = sext <32 x i8> %a to <32 x i16> ret <32 x i16> %x } -;SKX-LABEL: sext_32x8_to_32x16_mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %ymm1, %k1 -;SKX-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} -;SKX-NEXT: retq -define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone { +define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_32x8_to_32x16_mask: +; KNL: ## BB#0: +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; KNL-NEXT: vpmovsxbw %xmm2, %ymm2 +; KNL-NEXT: vpmovsxbw %xmm0, %ymm0 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; KNL-NEXT: vpsllw $15, %ymm3, %ymm3 +; KNL-NEXT: vpsraw $15, %ymm3, %ymm3 +; KNL-NEXT: vpand %ymm0, %ymm3, %ymm0 +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; KNL-NEXT: vpsllw $15, %ymm1, %ymm1 +; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 +; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: sext_32x8_to_32x16_mask: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %ymm1, %ymm1 +; SKX-NEXT: vpmovb2m %ymm1, %k1 +; SKX-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq %x = sext <32 x i8> %a to <32 x i16> - %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer + %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret } -;SKX-LABEL: zext_4x8mem_to_4x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovzxbd (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq define <4 x i32> @zext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_4x8mem_to_4x32: +; KNL: ## BB#0: +; KNL-NEXT: vpslld $31, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: zext_4x8mem_to_4x32: +; SKX: ## BB#0: +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 +; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vpmovzxbd (%rdi), %xmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <4 x i8>,<4 x i8> *%i,align 1 - %x = zext <4 x i8> %a to <4 x i32> - %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer + %x = zext <4 x i8> %a to <4 x i32> + %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer ret <4 x i32> %ret } -;SKX-LABEL: sext_4x8mem_to_4x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq define <4 x i32> @sext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_4x8mem_to_4x32: +; KNL: ## BB#0: +; KNL-NEXT: vpslld $31, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: vpmovsxbd (%rdi), %xmm1 +; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sext_4x8mem_to_4x32: +; SKX: ## BB#0: +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 +; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <4 x i8>,<4 x i8> *%i,align 1 - %x = sext <4 x i8> %a to <4 x i32> - %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer + %x = sext <4 x i8> %a to <4 x i32> + %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer ret <4 x i32> %ret } -;SKX-LABEL: zext_8x8mem_to_8x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovzxbd (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_8x8mem_to_8x32: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: zext_8x8mem_to_8x32: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpmovw2m %xmm0, %k1 +; SKX-NEXT: vpmovzxbd (%rdi), %ymm0 {%k1} {z} +; SKX-NEXT: retq %a = load <8 x i8>,<8 x i8> *%i,align 1 - %x = zext <8 x i8> %a to <8 x i32> - %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer + %x = zext <8 x i8> %a to <8 x i32> + %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer ret <8 x i32> %ret } -;SKX-LABEL: sext_8x8mem_to_8x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovsxbd (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_8x8mem_to_8x32: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL-NEXT: vpmovsxbd (%rdi), %ymm0 +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: sext_8x8mem_to_8x32: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpmovw2m %xmm0, %k1 +; SKX-NEXT: vpmovsxbd (%rdi), %ymm0 {%k1} {z} +; SKX-NEXT: retq %a = load <8 x i8>,<8 x i8> *%i,align 1 - %x = sext <8 x i8> %a to <8 x i32> - %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer + %x = sext <8 x i8> %a to <8 x i32> + %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer ret <8 x i32> %ret } -;KNL-LABEL: zext_16x8mem_to_16x32: -;KNL: vpmovzxbd (%rdi), %zmm0 {%k1} {z} -;KNL-NEXT: retq define <16 x i32> @zext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_16x8mem_to_16x32: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; KNL-NEXT: vpmovzxbd (%rdi), %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: zext_16x8mem_to_16x32: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 +; SKX-NEXT: vpmovb2m %xmm0, %k1 +; SKX-NEXT: vpmovzxbd (%rdi), %zmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <16 x i8>,<16 x i8> *%i,align 1 %x = zext <16 x i8> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %ret } -;KNL-LABEL: sext_16x8mem_to_16x32: -;KNL: vpmovsxbd (%rdi), %zmm0 {%k1} {z} -;KNL-NEXT: retq define <16 x i32> @sext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_16x8mem_to_16x32: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; KNL-NEXT: vpmovsxbd (%rdi), %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: sext_16x8mem_to_16x32: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 +; SKX-NEXT: vpmovb2m %xmm0, %k1 +; SKX-NEXT: vpmovsxbd (%rdi), %zmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <16 x i8>,<16 x i8> *%i,align 1 %x = sext <16 x i8> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %ret } -;KNL-LABEL: zext_16x8_to_16x32_mask: -;KNL: vpmovzxbd %xmm0, %zmm0 {%k1} {z} -;KNL-NEXT: retq define <16 x i32> @zext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_16x8_to_16x32_mask: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 +; KNL-NEXT: vpmovzxbd %xmm0, %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: zext_16x8_to_16x32_mask: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 +; SKX-NEXT: vpmovb2m %xmm1, %k1 +; SKX-NEXT: vpmovzxbd %xmm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq %x = zext <16 x i8> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %ret } -;KNL-LABEL: sext_16x8_to_16x32_mask: -;KNL: vpmovsxbd %xmm0, %zmm0 {%k1} {z} -;KNL-NEXT: retq define <16 x i32> @sext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_16x8_to_16x32_mask: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: sext_16x8_to_16x32_mask: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 +; SKX-NEXT: vpmovb2m %xmm1, %k1 +; SKX-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq %x = sext <16 x i8> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %ret } -; KNL-LABEL: zext_16x8_to_16x32 -; KNL: vpmovzxbd {{.*}}%zmm -; KNL: ret define <16 x i32> @zext_16x8_to_16x32(<16 x i8> %i) nounwind readnone { +; ALL-LABEL: zext_16x8_to_16x32: +; ALL: ## BB#0: +; ALL-NEXT: vpmovzxbd %xmm0, %zmm0 +; ALL-NEXT: retq %x = zext <16 x i8> %i to <16 x i32> ret <16 x i32> %x } -; KNL-LABEL: sext_16x8_to_16x32 -; KNL: vpmovsxbd {{.*}}%zmm -; KNL: ret define <16 x i32> @sext_16x8_to_16x32(<16 x i8> %i) nounwind readnone { +; ALL-LABEL: sext_16x8_to_16x32: +; ALL: ## BB#0: +; ALL-NEXT: vpmovsxbd %xmm0, %zmm0 +; ALL-NEXT: retq %x = sext <16 x i8> %i to <16 x i32> ret <16 x i32> %x } -;SKX-LABEL: zext_2x8mem_to_2x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovq2m %xmm0, %k1 -;SKX-NEXT: vpmovzxbq (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq define <2 x i64> @zext_2x8mem_to_2x64(<2 x i8> *%i , <2 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_2x8mem_to_2x64: +; KNL: ## BB#0: +; KNL-NEXT: vpsllq $63, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; KNL-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: zext_2x8mem_to_2x64: +; SKX: ## BB#0: +; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 +; SKX-NEXT: vpmovq2m %xmm0, %k1 +; SKX-NEXT: vpmovzxbq (%rdi), %xmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <2 x i8>,<2 x i8> *%i,align 1 %x = zext <2 x i8> %a to <2 x i64> %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer ret <2 x i64> %ret } -;SKX-LABEL: sext_2x8mem_to_2x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovq2m %xmm0, %k1 -;SKX-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq define <2 x i64> @sext_2x8mem_to_2x64mask(<2 x i8> *%i , <2 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_2x8mem_to_2x64mask: +; KNL: ## BB#0: +; KNL-NEXT: vpsllq $63, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; KNL-NEXT: vpmovsxbq (%rdi), %xmm1 +; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sext_2x8mem_to_2x64mask: +; SKX: ## BB#0: +; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 +; SKX-NEXT: vpmovq2m %xmm0, %k1 +; SKX-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <2 x i8>,<2 x i8> *%i,align 1 %x = sext <2 x i8> %a to <2 x i64> %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer ret <2 x i64> %ret } -;SKX-LABEL: sext_2x8mem_to_2x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxbq (%rdi), %xmm0 -;SKX-NEXT: retq define <2 x i64> @sext_2x8mem_to_2x64(<2 x i8> *%i) nounwind readnone { +; ALL-LABEL: sext_2x8mem_to_2x64: +; ALL: ## BB#0: +; ALL-NEXT: vpmovsxbq (%rdi), %xmm0 +; ALL-NEXT: retq %a = load <2 x i8>,<2 x i8> *%i,align 1 %x = sext <2 x i8> %a to <2 x i64> ret <2 x i64> %x } -;SKX-LABEL: zext_4x8mem_to_4x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovzxbq (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq define <4 x i64> @zext_4x8mem_to_4x64(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_4x8mem_to_4x64: +; KNL: ## BB#0: +; KNL-NEXT: vpslld $31, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: vpmovsxdq %xmm0, %ymm0 +; KNL-NEXT: vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero +; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: zext_4x8mem_to_4x64: +; SKX: ## BB#0: +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 +; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vpmovzxbq (%rdi), %ymm0 {%k1} {z} +; SKX-NEXT: retq %a = load <4 x i8>,<4 x i8> *%i,align 1 %x = zext <4 x i8> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer ret <4 x i64> %ret } -;SKX-LABEL: sext_4x8mem_to_4x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq define <4 x i64> @sext_4x8mem_to_4x64mask(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_4x8mem_to_4x64mask: +; KNL: ## BB#0: +; KNL-NEXT: vpslld $31, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: vpmovsxdq %xmm0, %ymm0 +; KNL-NEXT: vpmovsxbq (%rdi), %ymm1 +; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sext_4x8mem_to_4x64mask: +; SKX: ## BB#0: +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 +; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z} +; SKX-NEXT: retq %a = load <4 x i8>,<4 x i8> *%i,align 1 %x = sext <4 x i8> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer ret <4 x i64> %ret } -;SKX-LABEL: sext_4x8mem_to_4x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxbq (%rdi), %ymm0 -;SKX-NEXT: retq define <4 x i64> @sext_4x8mem_to_4x64(<4 x i8> *%i) nounwind readnone { +; ALL-LABEL: sext_4x8mem_to_4x64: +; ALL: ## BB#0: +; ALL-NEXT: vpmovsxbq (%rdi), %ymm0 +; ALL-NEXT: retq %a = load <4 x i8>,<4 x i8> *%i,align 1 %x = sext <4 x i8> %a to <4 x i64> ret <4 x i64> %x } -;KNL-LABEL: zext_8x8mem_to_8x64: -;KNL: vpmovzxbq (%rdi), %zmm0 {%k1} {z} -;KNL-NEXT: retq define <8 x i64> @zext_8x8mem_to_8x64(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_8x8mem_to_8x64: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL-NEXT: vpmovzxbq (%rdi), %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: zext_8x8mem_to_8x64: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpmovw2m %xmm0, %k1 +; SKX-NEXT: vpmovzxbq (%rdi), %zmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <8 x i8>,<8 x i8> *%i,align 1 %x = zext <8 x i8> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer ret <8 x i64> %ret } -;KNL-LABEL: sext_8x8mem_to_8x64mask: -;KNL: vpmovsxbq (%rdi), %zmm0 {%k1} {z} -;KNL-NEXT: retq define <8 x i64> @sext_8x8mem_to_8x64mask(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_8x8mem_to_8x64mask: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL-NEXT: vpmovsxbq (%rdi), %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: sext_8x8mem_to_8x64mask: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpmovw2m %xmm0, %k1 +; SKX-NEXT: vpmovsxbq (%rdi), %zmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <8 x i8>,<8 x i8> *%i,align 1 %x = sext <8 x i8> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer ret <8 x i64> %ret } -;KNL-LABEL: sext_8x8mem_to_8x64: -;KNL: vpmovsxbq (%rdi), %zmm0 -;KNL-NEXT: retq define <8 x i64> @sext_8x8mem_to_8x64(<8 x i8> *%i) nounwind readnone { +; ALL-LABEL: sext_8x8mem_to_8x64: +; ALL: ## BB#0: +; ALL-NEXT: vpmovsxbq (%rdi), %zmm0 +; ALL-NEXT: retq %a = load <8 x i8>,<8 x i8> *%i,align 1 %x = sext <8 x i8> %a to <8 x i64> ret <8 x i64> %x } -;SKX-LABEL: zext_4x16mem_to_4x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovzxwd (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq define <4 x i32> @zext_4x16mem_to_4x32(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_4x16mem_to_4x32: +; KNL: ## BB#0: +; KNL-NEXT: vpslld $31, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: zext_4x16mem_to_4x32: +; SKX: ## BB#0: +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 +; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vpmovzxwd (%rdi), %xmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <4 x i16>,<4 x i16> *%i,align 1 %x = zext <4 x i16> %a to <4 x i32> %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer ret <4 x i32> %ret } -;SKX-LABEL: sext_4x16mem_to_4x32mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq define <4 x i32> @sext_4x16mem_to_4x32mask(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_4x16mem_to_4x32mask: +; KNL: ## BB#0: +; KNL-NEXT: vpslld $31, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: vpmovsxwd (%rdi), %xmm1 +; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sext_4x16mem_to_4x32mask: +; SKX: ## BB#0: +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 +; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <4 x i16>,<4 x i16> *%i,align 1 %x = sext <4 x i16> %a to <4 x i32> %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer ret <4 x i32> %ret } -;SKX-LABEL: sext_4x16mem_to_4x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxwd (%rdi), %xmm0 -;SKX-NEXT: retq define <4 x i32> @sext_4x16mem_to_4x32(<4 x i16> *%i) nounwind readnone { +; ALL-LABEL: sext_4x16mem_to_4x32: +; ALL: ## BB#0: +; ALL-NEXT: vpmovsxwd (%rdi), %xmm0 +; ALL-NEXT: retq %a = load <4 x i16>,<4 x i16> *%i,align 1 %x = sext <4 x i16> %a to <4 x i32> ret <4 x i32> %x } -;SKX-LABEL: zext_8x16mem_to_8x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovzxwd (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_8x16mem_to_8x32: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: zext_8x16mem_to_8x32: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpmovw2m %xmm0, %k1 +; SKX-NEXT: vpmovzxwd (%rdi), %ymm0 {%k1} {z} +; SKX-NEXT: retq %a = load <8 x i16>,<8 x i16> *%i,align 1 %x = zext <8 x i16> %a to <8 x i32> %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer ret <8 x i32> %ret } -;SKX-LABEL: sext_8x16mem_to_8x32mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovsxwd (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_8x16mem_to_8x32mask: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL-NEXT: vpmovsxwd (%rdi), %ymm0 +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: sext_8x16mem_to_8x32mask: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpmovw2m %xmm0, %k1 +; SKX-NEXT: vpmovsxwd (%rdi), %ymm0 {%k1} {z} +; SKX-NEXT: retq %a = load <8 x i16>,<8 x i16> *%i,align 1 %x = sext <8 x i16> %a to <8 x i32> %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer ret <8 x i32> %ret } -;SKX-LABEL: sext_8x16mem_to_8x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxwd (%rdi), %ymm0 -;SKX-NEXT: retq define <8 x i32> @sext_8x16mem_to_8x32(<8 x i16> *%i) nounwind readnone { +; ALL-LABEL: sext_8x16mem_to_8x32: +; ALL: ## BB#0: +; ALL-NEXT: vpmovsxwd (%rdi), %ymm0 +; ALL-NEXT: retq %a = load <8 x i16>,<8 x i16> *%i,align 1 %x = sext <8 x i16> %a to <8 x i32> ret <8 x i32> %x } -;SKX-LABEL: zext_8x16_to_8x32mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm1, %k1 -;SKX-NEXT: vpmovzxwd %xmm0, %ymm0 {%k1} {z} -;SKX-NEXT: retq define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_8x16_to_8x32mask: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxwq %xmm1, %zmm1 +; KNL-NEXT: vpsllq $63, %zmm1, %zmm1 +; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: zext_8x16_to_8x32mask: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 +; SKX-NEXT: vpmovw2m %xmm1, %k1 +; SKX-NEXT: vpmovzxwd %xmm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq %x = zext <8 x i16> %a to <8 x i32> %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer ret <8 x i32> %ret } -;SKX-LABEL: zext_8x16_to_8x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovzxwd %xmm0, %ymm0 -;SKX-NEXT: retq define <8 x i32> @zext_8x16_to_8x32(<8 x i16> %a ) nounwind readnone { +; KNL-LABEL: zext_8x16_to_8x32: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-NEXT: retq +; +; SKX-LABEL: zext_8x16_to_8x32: +; SKX: ## BB#0: +; SKX-NEXT: vpmovzxwd %xmm0, %ymm0 +; SKX-NEXT: retq %x = zext <8 x i16> %a to <8 x i32> ret <8 x i32> %x } -;SKX-LABEL: zext_16x16mem_to_16x32: -;KNL-LABEL: zext_16x16mem_to_16x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %xmm0, %k1 -;SKX-NEXT: vpmovzxwd (%rdi), %zmm0 {%k1} {z} -;KNL: vpmovzxwd (%rdi), %zmm0 {%k1} {z} -;SKX-NEXT: retq define <16 x i32> @zext_16x16mem_to_16x32(<16 x i16> *%i , <16 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_16x16mem_to_16x32: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; KNL-NEXT: vpmovzxwd (%rdi), %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: zext_16x16mem_to_16x32: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 +; SKX-NEXT: vpmovb2m %xmm0, %k1 +; SKX-NEXT: vpmovzxwd (%rdi), %zmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <16 x i16>,<16 x i16> *%i,align 1 %x = zext <16 x i16> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %ret } -;SKX-LABEL: sext_16x16mem_to_16x32mask: -;KNL-LABEL: sext_16x16mem_to_16x32mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %xmm0, %k1 -;SKX-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z} -;KNL: vpmovsxwd (%rdi), %zmm0 {%k1} {z} -;SKX-NEXT: retq define <16 x i32> @sext_16x16mem_to_16x32mask(<16 x i16> *%i , <16 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_16x16mem_to_16x32mask: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; KNL-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: sext_16x16mem_to_16x32mask: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 +; SKX-NEXT: vpmovb2m %xmm0, %k1 +; SKX-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <16 x i16>,<16 x i16> *%i,align 1 %x = sext <16 x i16> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %ret } -;SKX-LABEL: sext_16x16mem_to_16x32: -;KNL-LABEL: sext_16x16mem_to_16x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxwd (%rdi), %zmm0 -;KNL: vpmovsxwd (%rdi), %zmm0 -;SKX-NEXT: retq define <16 x i32> @sext_16x16mem_to_16x32(<16 x i16> *%i) nounwind readnone { +; ALL-LABEL: sext_16x16mem_to_16x32: +; ALL: ## BB#0: +; ALL-NEXT: vpmovsxwd (%rdi), %zmm0 +; ALL-NEXT: retq %a = load <16 x i16>,<16 x i16> *%i,align 1 %x = sext <16 x i16> %a to <16 x i32> ret <16 x i32> %x } -;SKX-LABEL: zext_16x16_to_16x32mask: -;KNL-LABEL: zext_16x16_to_16x32mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %xmm1, %k1 -;SKX-NEXT: vpmovzxwd %ymm0, %zmm0 {%k1} {z} -;KNL: vpmovzxwd %ymm0, %zmm0 {%k1} {z} -;SKX-NEXT: retq define <16 x i32> @zext_16x16_to_16x32mask(<16 x i16> %a , <16 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_16x16_to_16x32mask: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 +; KNL-NEXT: vpmovzxwd %ymm0, %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: zext_16x16_to_16x32mask: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 +; SKX-NEXT: vpmovb2m %xmm1, %k1 +; SKX-NEXT: vpmovzxwd %ymm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq %x = zext <16 x i16> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %ret } -;SKX-LABEL: zext_16x16_to_16x32: -;KNL-LABEL: zext_16x16_to_16x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovzxwd %ymm0, %zmm0 -;KNL: vpmovzxwd %ymm0, %zmm0 -;SKX-NEXT: retq define <16 x i32> @zext_16x16_to_16x32(<16 x i16> %a ) nounwind readnone { +; ALL-LABEL: zext_16x16_to_16x32: +; ALL: ## BB#0: +; ALL-NEXT: vpmovzxwd %ymm0, %zmm0 +; ALL-NEXT: retq %x = zext <16 x i16> %a to <16 x i32> ret <16 x i32> %x } -;SKX-LABEL: zext_2x16mem_to_2x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovq2m %xmm0, %k1 -;SKX-NEXT: vpmovzxwq (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq define <2 x i64> @zext_2x16mem_to_2x64(<2 x i16> *%i , <2 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_2x16mem_to_2x64: +; KNL: ## BB#0: +; KNL-NEXT: vpsllq $63, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; KNL-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: zext_2x16mem_to_2x64: +; SKX: ## BB#0: +; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 +; SKX-NEXT: vpmovq2m %xmm0, %k1 +; SKX-NEXT: vpmovzxwq (%rdi), %xmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <2 x i16>,<2 x i16> *%i,align 1 %x = zext <2 x i16> %a to <2 x i64> %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer ret <2 x i64> %ret } -;SKX-LABEL: sext_2x16mem_to_2x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovq2m %xmm0, %k1 -;SKX-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq define <2 x i64> @sext_2x16mem_to_2x64mask(<2 x i16> *%i , <2 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_2x16mem_to_2x64mask: +; KNL: ## BB#0: +; KNL-NEXT: vpsllq $63, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; KNL-NEXT: vpmovsxwq (%rdi), %xmm1 +; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sext_2x16mem_to_2x64mask: +; SKX: ## BB#0: +; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 +; SKX-NEXT: vpmovq2m %xmm0, %k1 +; SKX-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <2 x i16>,<2 x i16> *%i,align 1 %x = sext <2 x i16> %a to <2 x i64> %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer ret <2 x i64> %ret } -;SKX-LABEL: sext_2x16mem_to_2x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxwq (%rdi), %xmm0 -;SKX-NEXT: retq define <2 x i64> @sext_2x16mem_to_2x64(<2 x i16> *%i) nounwind readnone { +; ALL-LABEL: sext_2x16mem_to_2x64: +; ALL: ## BB#0: +; ALL-NEXT: vpmovsxwq (%rdi), %xmm0 +; ALL-NEXT: retq %a = load <2 x i16>,<2 x i16> *%i,align 1 %x = sext <2 x i16> %a to <2 x i64> ret <2 x i64> %x } -;SKX-LABEL: zext_4x16mem_to_4x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovzxwq (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq define <4 x i64> @zext_4x16mem_to_4x64(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_4x16mem_to_4x64: +; KNL: ## BB#0: +; KNL-NEXT: vpslld $31, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: vpmovsxdq %xmm0, %ymm0 +; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: zext_4x16mem_to_4x64: +; SKX: ## BB#0: +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 +; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vpmovzxwq (%rdi), %ymm0 {%k1} {z} +; SKX-NEXT: retq %a = load <4 x i16>,<4 x i16> *%i,align 1 %x = zext <4 x i16> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer ret <4 x i64> %ret } -;SKX-LABEL: sext_4x16mem_to_4x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq define <4 x i64> @sext_4x16mem_to_4x64mask(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_4x16mem_to_4x64mask: +; KNL: ## BB#0: +; KNL-NEXT: vpslld $31, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: vpmovsxdq %xmm0, %ymm0 +; KNL-NEXT: vpmovsxwq (%rdi), %ymm1 +; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sext_4x16mem_to_4x64mask: +; SKX: ## BB#0: +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 +; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z} +; SKX-NEXT: retq %a = load <4 x i16>,<4 x i16> *%i,align 1 %x = sext <4 x i16> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer ret <4 x i64> %ret } -;SKX-LABEL: sext_4x16mem_to_4x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxwq (%rdi), %ymm0 -;SKX-NEXT: retq define <4 x i64> @sext_4x16mem_to_4x64(<4 x i16> *%i) nounwind readnone { +; ALL-LABEL: sext_4x16mem_to_4x64: +; ALL: ## BB#0: +; ALL-NEXT: vpmovsxwq (%rdi), %ymm0 +; ALL-NEXT: retq %a = load <4 x i16>,<4 x i16> *%i,align 1 %x = sext <4 x i16> %a to <4 x i64> ret <4 x i64> %x } -;SKX-LABEL: zext_8x16mem_to_8x64: -;KNL-LABEL: zext_8x16mem_to_8x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovzxwq (%rdi), %zmm0 {%k1} {z} -;KNL: vpmovzxwq (%rdi), %zmm0 {%k1} {z} -;SKX-NEXT: retq define <8 x i64> @zext_8x16mem_to_8x64(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_8x16mem_to_8x64: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL-NEXT: vpmovzxwq (%rdi), %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: zext_8x16mem_to_8x64: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpmovw2m %xmm0, %k1 +; SKX-NEXT: vpmovzxwq (%rdi), %zmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <8 x i16>,<8 x i16> *%i,align 1 %x = zext <8 x i16> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer ret <8 x i64> %ret } -;SKX-LABEL: sext_8x16mem_to_8x64mask: -;KNL-LABEL: sext_8x16mem_to_8x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z} -;KNL: vpmovsxwq (%rdi), %zmm0 {%k1} {z} -;SKX-NEXT: retq define <8 x i64> @sext_8x16mem_to_8x64mask(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_8x16mem_to_8x64mask: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: sext_8x16mem_to_8x64mask: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpmovw2m %xmm0, %k1 +; SKX-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <8 x i16>,<8 x i16> *%i,align 1 %x = sext <8 x i16> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer ret <8 x i64> %ret } -;SKX-LABEL: sext_8x16mem_to_8x64: -;KNL-LABEL: sext_8x16mem_to_8x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxwq (%rdi), %zmm0 -;KNL: vpmovsxwq (%rdi), %zmm0 -;SKX-NEXT: retq define <8 x i64> @sext_8x16mem_to_8x64(<8 x i16> *%i) nounwind readnone { +; ALL-LABEL: sext_8x16mem_to_8x64: +; ALL: ## BB#0: +; ALL-NEXT: vpmovsxwq (%rdi), %zmm0 +; ALL-NEXT: retq %a = load <8 x i16>,<8 x i16> *%i,align 1 %x = sext <8 x i16> %a to <8 x i64> ret <8 x i64> %x } -;SKX-LABEL: zext_8x16_to_8x64mask: -;KNL-LABEL: zext_8x16_to_8x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm1, %k1 -;SKX-NEXT: vpmovzxwq %xmm0, %zmm0 {%k1} {z} -;KNL: vpmovzxwq %xmm0, %zmm0 {%k1} {z} -;SKX-NEXT: retq define <8 x i64> @zext_8x16_to_8x64mask(<8 x i16> %a , <8 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_8x16_to_8x64mask: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxwq %xmm1, %zmm1 +; KNL-NEXT: vpsllq $63, %zmm1, %zmm1 +; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL-NEXT: vpmovzxwq %xmm0, %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: zext_8x16_to_8x64mask: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 +; SKX-NEXT: vpmovw2m %xmm1, %k1 +; SKX-NEXT: vpmovzxwq %xmm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq %x = zext <8 x i16> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer ret <8 x i64> %ret } -;SKX-LABEL: zext_8x16_to_8x64: -;KNL-LABEL: zext_8x16_to_8x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovzxwq %xmm0, %zmm0 -;KNL: vpmovzxwq %xmm0, %zmm0 -;SKX-NEXT: retq -; KNL: ret define <8 x i64> @zext_8x16_to_8x64(<8 x i16> %a) nounwind readnone { +; ALL-LABEL: zext_8x16_to_8x64: +; ALL: ## BB#0: +; ALL-NEXT: vpmovzxwq %xmm0, %zmm0 +; ALL-NEXT: retq %ret = zext <8 x i16> %a to <8 x i64> ret <8 x i64> %ret } -;SKX-LABEL: zext_2x32mem_to_2x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovq2m %xmm0, %k1 -;SKX-NEXT: vpmovzxdq (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq define <2 x i64> @zext_2x32mem_to_2x64(<2 x i32> *%i , <2 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_2x32mem_to_2x64: +; KNL: ## BB#0: +; KNL-NEXT: vpsllq $63, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; KNL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero +; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: zext_2x32mem_to_2x64: +; SKX: ## BB#0: +; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 +; SKX-NEXT: vpmovq2m %xmm0, %k1 +; SKX-NEXT: vpmovzxdq (%rdi), %xmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <2 x i32>,<2 x i32> *%i,align 1 %x = zext <2 x i32> %a to <2 x i64> %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer ret <2 x i64> %ret } -;SKX-LABEL: sext_2x32mem_to_2x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovq2m %xmm0, %k1 -;SKX-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq define <2 x i64> @sext_2x32mem_to_2x64mask(<2 x i32> *%i , <2 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_2x32mem_to_2x64mask: +; KNL: ## BB#0: +; KNL-NEXT: vpsllq $63, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; KNL-NEXT: vpmovsxdq (%rdi), %xmm1 +; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sext_2x32mem_to_2x64mask: +; SKX: ## BB#0: +; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 +; SKX-NEXT: vpmovq2m %xmm0, %k1 +; SKX-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <2 x i32>,<2 x i32> *%i,align 1 %x = sext <2 x i32> %a to <2 x i64> %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer ret <2 x i64> %ret } -;SKX-LABEL: sext_2x32mem_to_2x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxdq (%rdi), %xmm0 -;SKX-NEXT: retq define <2 x i64> @sext_2x32mem_to_2x64(<2 x i32> *%i) nounwind readnone { +; ALL-LABEL: sext_2x32mem_to_2x64: +; ALL: ## BB#0: +; ALL-NEXT: vpmovsxdq (%rdi), %xmm0 +; ALL-NEXT: retq %a = load <2 x i32>,<2 x i32> *%i,align 1 %x = sext <2 x i32> %a to <2 x i64> ret <2 x i64> %x } -;SKX-LABEL: zext_4x32mem_to_4x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovzxdq (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq define <4 x i64> @zext_4x32mem_to_4x64(<4 x i32> *%i , <4 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_4x32mem_to_4x64: +; KNL: ## BB#0: +; KNL-NEXT: vpslld $31, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: vpmovsxdq %xmm0, %ymm0 +; KNL-NEXT: vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: zext_4x32mem_to_4x64: +; SKX: ## BB#0: +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 +; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vpmovzxdq (%rdi), %ymm0 {%k1} {z} +; SKX-NEXT: retq %a = load <4 x i32>,<4 x i32> *%i,align 1 %x = zext <4 x i32> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer ret <4 x i64> %ret } -;SKX-LABEL: sext_4x32mem_to_4x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq define <4 x i64> @sext_4x32mem_to_4x64mask(<4 x i32> *%i , <4 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_4x32mem_to_4x64mask: +; KNL: ## BB#0: +; KNL-NEXT: vpslld $31, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: vpmovsxdq %xmm0, %ymm0 +; KNL-NEXT: vpmovsxdq (%rdi), %ymm1 +; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sext_4x32mem_to_4x64mask: +; SKX: ## BB#0: +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 +; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z} +; SKX-NEXT: retq %a = load <4 x i32>,<4 x i32> *%i,align 1 %x = sext <4 x i32> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer ret <4 x i64> %ret } -;SKX-LABEL: sext_4x32mem_to_4x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxdq (%rdi), %ymm0 -;SKX-NEXT: retq define <4 x i64> @sext_4x32mem_to_4x64(<4 x i32> *%i) nounwind readnone { +; ALL-LABEL: sext_4x32mem_to_4x64: +; ALL: ## BB#0: +; ALL-NEXT: vpmovsxdq (%rdi), %ymm0 +; ALL-NEXT: retq %a = load <4 x i32>,<4 x i32> *%i,align 1 %x = sext <4 x i32> %a to <4 x i64> ret <4 x i64> %x } -;SKX-LABEL: sext_4x32_to_4x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxdq %xmm0, %ymm0 -;SKX-NEXT: retq define <4 x i64> @sext_4x32_to_4x64(<4 x i32> %a) nounwind readnone { +; ALL-LABEL: sext_4x32_to_4x64: +; ALL: ## BB#0: +; ALL-NEXT: vpmovsxdq %xmm0, %ymm0 +; ALL-NEXT: retq %x = sext <4 x i32> %a to <4 x i64> ret <4 x i64> %x } -;SKX-LABEL: zext_4x32_to_4x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm1, %k1 -;SKX-NEXT: vpmovzxdq %xmm0, %ymm0 {%k1} {z} -;SKX-NEXT: retq define <4 x i64> @zext_4x32_to_4x64mask(<4 x i32> %a , <4 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_4x32_to_4x64mask: +; KNL: ## BB#0: +; KNL-NEXT: vpslld $31, %xmm1, %xmm1 +; KNL-NEXT: vpsrad $31, %xmm1, %xmm1 +; KNL-NEXT: vpmovsxdq %xmm1, %ymm1 +; KNL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; KNL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: zext_4x32_to_4x64mask: +; SKX: ## BB#0: +; SKX-NEXT: vpslld $31, %xmm1, %xmm1 +; SKX-NEXT: vpmovd2m %xmm1, %k1 +; SKX-NEXT: vpmovzxdq %xmm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq %x = zext <4 x i32> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer ret <4 x i64> %ret } -;SKX-LABEL: zext_8x32mem_to_8x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovzxdq (%rdi), %zmm0 {%k1} {z} -;SKX-NEXT: retq define <8 x i64> @zext_8x32mem_to_8x64(<8 x i32> *%i , <8 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_8x32mem_to_8x64: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL-NEXT: vpmovzxdq (%rdi), %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: zext_8x32mem_to_8x64: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpmovw2m %xmm0, %k1 +; SKX-NEXT: vpmovzxdq (%rdi), %zmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <8 x i32>,<8 x i32> *%i,align 1 %x = zext <8 x i32> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer ret <8 x i64> %ret } -;SKX-LABEL: sext_8x32mem_to_8x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z} -;SKX-NEXT: retq define <8 x i64> @sext_8x32mem_to_8x64mask(<8 x i32> *%i , <8 x i1> %mask) nounwind readnone { +; KNL-LABEL: sext_8x32mem_to_8x64mask: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: sext_8x32mem_to_8x64mask: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpmovw2m %xmm0, %k1 +; SKX-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z} +; SKX-NEXT: retq %a = load <8 x i32>,<8 x i32> *%i,align 1 %x = sext <8 x i32> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer ret <8 x i64> %ret } -;SKX-LABEL: sext_8x32mem_to_8x64: -;KNL-LABEL: sext_8x32mem_to_8x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxdq (%rdi), %zmm0 -;KNL: vpmovsxdq (%rdi), %zmm0 -;SKX-NEXT: retq define <8 x i64> @sext_8x32mem_to_8x64(<8 x i32> *%i) nounwind readnone { +; ALL-LABEL: sext_8x32mem_to_8x64: +; ALL: ## BB#0: +; ALL-NEXT: vpmovsxdq (%rdi), %zmm0 +; ALL-NEXT: retq %a = load <8 x i32>,<8 x i32> *%i,align 1 %x = sext <8 x i32> %a to <8 x i64> ret <8 x i64> %x } -;SKX-LABEL: sext_8x32_to_8x64: -;KNL-LABEL: sext_8x32_to_8x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxdq %ymm0, %zmm0 -;KNL: vpmovsxdq %ymm0, %zmm0 -;SKX-NEXT: retq define <8 x i64> @sext_8x32_to_8x64(<8 x i32> %a) nounwind readnone { +; ALL-LABEL: sext_8x32_to_8x64: +; ALL: ## BB#0: +; ALL-NEXT: vpmovsxdq %ymm0, %zmm0 +; ALL-NEXT: retq %x = sext <8 x i32> %a to <8 x i64> ret <8 x i64> %x } -;SKX-LABEL: zext_8x32_to_8x64mask: -;KNL-LABEL: zext_8x32_to_8x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm1, %k1 -;SKX-NEXT: vpmovzxdq %ymm0, %zmm0 {%k1} {z} -;KNL: vpmovzxdq %ymm0, %zmm0 {%k1} {z} -;SKX-NEXT: retq define <8 x i64> @zext_8x32_to_8x64mask(<8 x i32> %a , <8 x i1> %mask) nounwind readnone { +; KNL-LABEL: zext_8x32_to_8x64mask: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxwq %xmm1, %zmm1 +; KNL-NEXT: vpsllq $63, %zmm1, %zmm1 +; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL-NEXT: vpmovzxdq %ymm0, %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: zext_8x32_to_8x64mask: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 +; SKX-NEXT: vpmovw2m %xmm1, %k1 +; SKX-NEXT: vpmovzxdq %ymm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq %x = zext <8 x i32> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer ret <8 x i64> %ret } -;KNL-LABEL: fptrunc_test -;KNL: vcvtpd2ps {{.*}}%zmm -;KNL: ret define <8 x float> @fptrunc_test(<8 x double> %a) nounwind readnone { +; ALL-LABEL: fptrunc_test: +; ALL: ## BB#0: +; ALL-NEXT: vcvtpd2ps %zmm0, %ymm0 +; ALL-NEXT: retq %b = fptrunc <8 x double> %a to <8 x float> ret <8 x float> %b } -;KNL-LABEL: fpext_test -;KNL: vcvtps2pd {{.*}}%zmm -;KNL: ret define <8 x double> @fpext_test(<8 x float> %a) nounwind readnone { +; ALL-LABEL: fpext_test: +; ALL: ## BB#0: +; ALL-NEXT: vcvtps2pd %ymm0, %zmm0 +; ALL-NEXT: retq %b = fpext <8 x float> %a to <8 x double> ret <8 x double> %b } -; KNL-LABEL: zext_16i1_to_16xi32 -; KNL: vpbroadcastd LCP{{.*}}(%rip), %zmm0 {%k1} {z} -; KNL: ret define <16 x i32> @zext_16i1_to_16xi32(i16 %b) { +; ALL-LABEL: zext_16i1_to_16xi32: +; ALL: ## BB#0: +; ALL-NEXT: kmovw %edi, %k1 +; ALL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; ALL-NEXT: retq %a = bitcast i16 %b to <16 x i1> %c = zext <16 x i1> %a to <16 x i32> ret <16 x i32> %c } -; KNL-LABEL: zext_8i1_to_8xi64 -; KNL: vpbroadcastq LCP{{.*}}(%rip), %zmm0 {%k1} {z} -; KNL: ret define <8 x i64> @zext_8i1_to_8xi64(i8 %b) { +; KNL-LABEL: zext_8i1_to_8xi64: +; KNL: ## BB#0: +; KNL-NEXT: movzbl %dil, %eax +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: zext_8i1_to_8xi64: +; SKX: ## BB#0: +; SKX-NEXT: kmovb %edi, %k1 +; SKX-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; SKX-NEXT: retq %a = bitcast i8 %b to <8 x i1> %c = zext <8 x i1> %a to <8 x i64> ret <8 x i64> %c } -; KNL-LABEL: trunc_16i8_to_16i1 -; KNL: vpmovsxbd -; KNL: vpandd -; KNL: vptestmd -; KNL: ret -; SKX-LABEL: trunc_16i8_to_16i1 -; SKX: vpmovb2m %xmm define i16 @trunc_16i8_to_16i1(<16 x i8> %a) { +; KNL-LABEL: trunc_16i8_to_16i1: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_16i8_to_16i1: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 +; SKX-NEXT: vpmovb2m %xmm0, %k0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: retq %mask_b = trunc <16 x i8>%a to <16 x i1> %mask = bitcast <16 x i1> %mask_b to i16 ret i16 %mask } -; KNL-LABEL: trunc_16i32_to_16i1 -; KNL: vpandd -; KNL: vptestmd -; KNL: ret -; SKX-LABEL: trunc_16i32_to_16i1 -; SKX: vpmovd2m %zmm define i16 @trunc_16i32_to_16i1(<16 x i32> %a) { +; KNL-LABEL: trunc_16i32_to_16i1: +; KNL: ## BB#0: +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_16i32_to_16i1: +; SKX: ## BB#0: +; SKX-NEXT: vpslld $31, %zmm0, %zmm0 +; SKX-NEXT: vpmovd2m %zmm0, %k0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: retq %mask_b = trunc <16 x i32>%a to <16 x i1> %mask = bitcast <16 x i1> %mask_b to i16 ret i16 %mask } -; SKX-LABEL: trunc_4i32_to_4i1 -; SKX: vpmovd2m %xmm -; SKX: kandw -; SKX: vpmovm2d define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) { +; KNL-LABEL: trunc_4i32_to_4i1: +; KNL: ## BB#0: +; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpslld $31, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_4i32_to_4i1: +; SKX: ## BB#0: +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 +; SKX-NEXT: vpmovd2m %xmm0, %k0 +; SKX-NEXT: vpslld $31, %xmm1, %xmm0 +; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: kandw %k1, %k0, %k0 +; SKX-NEXT: vpmovm2d %k0, %xmm0 +; SKX-NEXT: retq %mask_a = trunc <4 x i32>%a to <4 x i1> %mask_b = trunc <4 x i32>%b to <4 x i1> %a_and_b = and <4 x i1>%mask_a, %mask_b @@ -853,25 +1390,42 @@ define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) { ret <4 x i32>%res } -; KNL-LABEL: trunc_8i16_to_8i1 -; KNL: vpmovsxwq -; KNL: vpandq LCP{{.*}}(%rip){1to8} -; KNL: vptestmq -; KNL: ret -; SKX-LABEL: trunc_8i16_to_8i1 -; SKX: vpmovw2m %xmm define i8 @trunc_8i16_to_8i1(<8 x i16> %a) { +; KNL-LABEL: trunc_8i16_to_8i1: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_8i16_to_8i1: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpmovw2m %xmm0, %k0 +; SKX-NEXT: kmovb %k0, %eax +; SKX-NEXT: retq %mask_b = trunc <8 x i16>%a to <8 x i1> %mask = bitcast <8 x i1> %mask_b to i8 ret i8 %mask } -; KNL-LABEL: sext_8i1_8i32 -; KNL: vpbroadcastq LCP{{.*}}(%rip), %zmm0 {%k1} {z} -; SKX: vpmovm2d -; KNL: ret define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind { +; KNL-LABEL: sext_8i1_8i32: +; KNL: ## BB#0: +; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 +; KNL-NEXT: knotw %k0, %k1 +; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sext_8i1_8i32: +; SKX: ## BB#0: +; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 +; SKX-NEXT: knotb %k0, %k0 +; SKX-NEXT: vpmovm2d %k0, %ymm0 +; SKX-NEXT: retq %x = icmp slt <8 x i32> %a1, %a2 %x1 = xor <8 x i1>%x, %y = sext <8 x i1> %x1 to <8 x i32> @@ -879,59 +1433,403 @@ define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind { } -; KNL-LABEL: trunc_i32_to_i1 -; KNL: movw $-4, %ax -; KNL: kmovw %eax, %k1 -; KNL: korw define i16 @trunc_i32_to_i1(i32 %a) { +; ALL-LABEL: trunc_i32_to_i1: +; ALL: ## BB#0: +; ALL-NEXT: andl $1, %edi +; ALL-NEXT: kmovw %edi, %k0 +; ALL-NEXT: movw $-4, %ax +; ALL-NEXT: kmovw %eax, %k1 +; ALL-NEXT: korw %k0, %k1, %k0 +; ALL-NEXT: kmovw %k0, %eax +; ALL-NEXT: retq %a_i = trunc i32 %a to i1 %maskv = insertelement <16 x i1> , i1 %a_i, i32 0 %res = bitcast <16 x i1> %maskv to i16 ret i16 %res } -; KNL-LABEL: sext_8i1_8i16 -; SKX: vpmovm2w -; KNL: ret define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind { +; KNL-LABEL: sext_8i1_8i16: +; KNL: ## BB#0: +; KNL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sext_8i1_8i16: +; SKX: ## BB#0: +; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 +; SKX-NEXT: vpmovm2w %k0, %xmm0 +; SKX-NEXT: retq %x = icmp slt <8 x i32> %a1, %a2 %y = sext <8 x i1> %x to <8 x i16> ret <8 x i16> %y } -; KNL-LABEL: sext_16i1_16i32 -; SKX: vpmovm2d -; KNL: ret define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind { +; KNL-LABEL: sext_16i1_16i32: +; KNL: ## BB#0: +; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: sext_16i1_16i32: +; SKX: ## BB#0: +; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 +; SKX-NEXT: vpmovm2d %k0, %zmm0 +; SKX-NEXT: retq %x = icmp slt <16 x i32> %a1, %a2 %y = sext <16 x i1> %x to <16 x i32> ret <16 x i32> %y } -; KNL-LABEL: sext_8i1_8i64 -; SKX: vpmovm2q -; KNL: ret define <8 x i64> @sext_8i1_8i64(<8 x i32> %a1, <8 x i32> %a2) nounwind { +; KNL-LABEL: sext_8i1_8i64: +; KNL: ## BB#0: +; KNL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; KNL-NEXT: vpmovsxdq %ymm0, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sext_8i1_8i64: +; SKX: ## BB#0: +; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 +; SKX-NEXT: vpmovm2q %k0, %zmm0 +; SKX-NEXT: retq %x = icmp slt <8 x i32> %a1, %a2 %y = sext <8 x i1> %x to <8 x i64> ret <8 x i64> %y } -; KNL-LABEL: @extload_v8i64 -; KNL: vpmovsxbq define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) { +; ALL-LABEL: extload_v8i64: +; ALL: ## BB#0: +; ALL-NEXT: vpmovsxbq (%rdi), %zmm0 +; ALL-NEXT: vmovdqa64 %zmm0, (%rsi) +; ALL-NEXT: retq %sign_load = load <8 x i8>, <8 x i8>* %a %c = sext <8 x i8> %sign_load to <8 x i64> store <8 x i64> %c, <8 x i64>* %res ret void } -;SKX-LABEL: test21: -;SKX: vmovdqu16 %zmm0, %zmm3 {%k1} -;SKX-NEXT: kshiftrq $32, %k1, %k1 -;SKX-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { +; KNL-LABEL: test21: +; KNL: ## BB#0: +; KNL-NEXT: pushq %rbp +; KNL-NEXT: pushq %r15 +; KNL-NEXT: pushq %r14 +; KNL-NEXT: pushq %r13 +; KNL-NEXT: pushq %r12 +; KNL-NEXT: pushq %rbx +; KNL-NEXT: vpmovsxbd %xmm7, %zmm7 +; KNL-NEXT: vpslld $31, %zmm7, %zmm7 +; KNL-NEXT: vpmovsxbd %xmm6, %zmm6 +; KNL-NEXT: vpslld $31, %zmm6, %zmm6 +; KNL-NEXT: vpmovsxbd %xmm5, %zmm5 +; KNL-NEXT: vpslld $31, %zmm5, %zmm5 +; KNL-NEXT: vpmovsxbd %xmm4, %zmm4 +; KNL-NEXT: vpslld $31, %zmm4, %zmm4 +; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $13, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kshiftlw $12, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %edi +; KNL-NEXT: kshiftlw $11, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %esi +; KNL-NEXT: kshiftlw $10, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %r13d +; KNL-NEXT: kshiftlw $9, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %r8d +; KNL-NEXT: kshiftlw $8, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %r10d +; KNL-NEXT: kshiftlw $7, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %r11d +; KNL-NEXT: kshiftlw $6, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %ebx +; KNL-NEXT: kshiftlw $5, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %ebp +; KNL-NEXT: kshiftlw $4, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %r14d +; KNL-NEXT: kshiftlw $3, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %r15d +; KNL-NEXT: kshiftlw $2, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: kshiftlw $1, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %r12d +; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1 +; KNL-NEXT: kshiftlw $0, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vmovd %eax, %xmm4 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kshiftlw $14, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $1, %edx, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; KNL-NEXT: kshiftlw $15, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $2, %ecx, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kshiftlw $13, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $3, %edi, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %edi +; KNL-NEXT: kshiftlw $12, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $4, %esi, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %esi +; KNL-NEXT: kshiftlw $11, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $5, %r13d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r13d +; KNL-NEXT: kshiftlw $10, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $6, %r8d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r8d +; KNL-NEXT: kshiftlw $9, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $7, %r10d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r10d +; KNL-NEXT: kshiftlw $8, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $8, %r11d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r11d +; KNL-NEXT: kshiftlw $7, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $9, %ebx, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %ebx +; KNL-NEXT: kshiftlw $6, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $10, %ebp, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %ebp +; KNL-NEXT: kshiftlw $5, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $11, %r14d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r14d +; KNL-NEXT: kshiftlw $4, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $12, %r15d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r15d +; KNL-NEXT: kshiftlw $3, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $13, %r9d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; KNL-NEXT: kshiftlw $2, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $14, %r12d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r12d +; KNL-NEXT: kshiftlw $1, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r9d +; KNL-NEXT: vptestmd %zmm6, %zmm6, %k0 +; KNL-NEXT: kshiftlw $0, %k1, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: vmovd %ecx, %xmm5 +; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kshiftlw $14, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: vpinsrb $2, %edi, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $13, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: vpinsrb $3, %esi, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %edi +; KNL-NEXT: kshiftlw $12, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: vpinsrb $4, %r13d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kshiftlw $11, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: vpinsrb $5, %r8d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %r8d +; KNL-NEXT: kshiftlw $10, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: vpinsrb $6, %r10d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %r13d +; KNL-NEXT: kshiftlw $9, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: vpinsrb $7, %r11d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %esi +; KNL-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; KNL-NEXT: kshiftlw $8, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: vpinsrb $8, %ebx, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %ebx +; KNL-NEXT: kshiftlw $7, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: vpinsrb $9, %ebp, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %ebp +; KNL-NEXT: kshiftlw $6, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: vpinsrb $10, %r14d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %r10d +; KNL-NEXT: kshiftlw $5, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: vpinsrb $11, %r15d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %r11d +; KNL-NEXT: kshiftlw $4, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; KNL-NEXT: kmovw %k1, %esi +; KNL-NEXT: kshiftlw $3, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: vpinsrb $13, %r12d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %r14d +; KNL-NEXT: kshiftlw $2, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: vpinsrb $14, %r9d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: kshiftlw $1, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: vpinsrb $15, %edx, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %r15d +; KNL-NEXT: vptestmd %zmm7, %zmm7, %k1 +; KNL-NEXT: kshiftlw $0, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vmovd %eax, %xmm6 +; KNL-NEXT: kmovw %k0, %r12d +; KNL-NEXT: kshiftlw $14, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kshiftlw $15, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $2, %edi, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: kshiftlw $13, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $3, %ecx, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kshiftlw $12, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $4, %r8d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r8d +; KNL-NEXT: kshiftlw $11, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $5, %r13d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r13d +; KNL-NEXT: kshiftlw $10, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload +; KNL-NEXT: kmovw %k0, %edi +; KNL-NEXT: kshiftlw $9, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $7, %ebx, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %ebx +; KNL-NEXT: kshiftlw $8, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $8, %ebp, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %ebp +; KNL-NEXT: kshiftlw $7, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $9, %r10d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r10d +; KNL-NEXT: kshiftlw $6, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $10, %r11d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r11d +; KNL-NEXT: kshiftlw $5, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $11, %esi, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %esi +; KNL-NEXT: kshiftlw $4, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $12, %r14d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r14d +; KNL-NEXT: kshiftlw $3, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $13, %r9d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r9d +; KNL-NEXT: kshiftlw $2, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $14, %r15d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r15d +; KNL-NEXT: kshiftlw $1, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $15, %r12d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r12d +; KNL-NEXT: kshiftlw $0, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vmovd %edx, %xmm7 +; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: vpinsrb $1, %eax, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $2, %ecx, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $3, %r8d, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $4, %r13d, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $5, %edi, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $6, %ebx, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $7, %ebp, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $8, %r10d, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $9, %r11d, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $10, %esi, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $11, %r14d, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $12, %r9d, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $13, %r15d, %xmm7, %xmm7 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; KNL-NEXT: vpsllw $15, %ymm4, %ymm4 +; KNL-NEXT: vpsraw $15, %ymm4, %ymm4 +; KNL-NEXT: vpand %ymm0, %ymm4, %ymm0 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero +; KNL-NEXT: vpsllw $15, %ymm4, %ymm4 +; KNL-NEXT: vpsraw $15, %ymm4, %ymm4 +; KNL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero +; KNL-NEXT: vpsllw $15, %ymm4, %ymm4 +; KNL-NEXT: vpsraw $15, %ymm4, %ymm4 +; KNL-NEXT: vpand %ymm2, %ymm4, %ymm2 +; KNL-NEXT: vpinsrb $14, %r12d, %xmm7, %xmm4 +; KNL-NEXT: vpinsrb $15, %edx, %xmm4, %xmm4 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; KNL-NEXT: vpsllw $15, %ymm4, %ymm4 +; KNL-NEXT: vpsraw $15, %ymm4, %ymm4 +; KNL-NEXT: vpand %ymm3, %ymm4, %ymm3 +; KNL-NEXT: popq %rbx +; KNL-NEXT: popq %r12 +; KNL-NEXT: popq %r13 +; KNL-NEXT: popq %r14 +; KNL-NEXT: popq %r15 +; KNL-NEXT: popq %rbp +; KNL-NEXT: retq +; +; SKX-LABEL: test21: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %zmm2, %zmm2 +; SKX-NEXT: vpmovb2m %zmm2, %k1 +; SKX-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; SKX-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; SKX-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} +; SKX-NEXT: kshiftrq $32, %k1, %k1 +; SKX-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} +; SKX-NEXT: vmovaps %zmm3, %zmm0 +; SKX-NEXT: vmovaps %zmm2, %zmm1 +; SKX-NEXT: retq %ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer ret <64 x i16> %ret -} +} diff --git a/test/CodeGen/X86/avx512-fma.ll b/test/CodeGen/X86/avx512-fma.ll index ed046de005c..9279441a23c 100644 --- a/test/CodeGen/X86/avx512-fma.ll +++ b/test/CodeGen/X86/avx512-fma.ll @@ -1,81 +1,93 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f -fp-contract=fast | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -fp-contract=fast | FileCheck %s --check-prefix=SKX +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=SKX -; CHECK-LABEL: test_x86_fmadd_ps_z -; CHECK: vfmadd213ps %zmm2, %zmm1, %zmm0 -; CHECK: ret define <16 x float> @test_x86_fmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { +; ALL-LABEL: test_x86_fmadd_ps_z: +; ALL: ## BB#0: +; ALL-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm0 +; ALL-NEXT: retq %x = fmul <16 x float> %a0, %a1 %res = fadd <16 x float> %x, %a2 ret <16 x float> %res } -; CHECK-LABEL: test_x86_fmsub_ps_z -; CHECK: vfmsub213ps %zmm2, %zmm1, %zmm0 -; CHECK: ret define <16 x float> @test_x86_fmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { +; ALL-LABEL: test_x86_fmsub_ps_z: +; ALL: ## BB#0: +; ALL-NEXT: vfmsub213ps %zmm2, %zmm1, %zmm0 +; ALL-NEXT: retq %x = fmul <16 x float> %a0, %a1 %res = fsub <16 x float> %x, %a2 ret <16 x float> %res } -; CHECK-LABEL: test_x86_fnmadd_ps_z -; CHECK: vfnmadd213ps %zmm2, %zmm1, %zmm0 -; CHECK: ret define <16 x float> @test_x86_fnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { +; ALL-LABEL: test_x86_fnmadd_ps_z: +; ALL: ## BB#0: +; ALL-NEXT: vfnmadd213ps %zmm2, %zmm1, %zmm0 +; ALL-NEXT: retq %x = fmul <16 x float> %a0, %a1 %res = fsub <16 x float> %a2, %x ret <16 x float> %res } -; CHECK-LABEL: test_x86_fnmsub_ps_z -; CHECK: vfnmsub213ps %zmm2, %zmm1, %zmm0 -; CHECK: ret define <16 x float> @test_x86_fnmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { +; ALL-LABEL: test_x86_fnmsub_ps_z: +; ALL: ## BB#0: +; ALL-NEXT: vfnmsub213ps %zmm2, %zmm1, %zmm0 +; ALL-NEXT: retq %x = fmul <16 x float> %a0, %a1 - %y = fsub <16 x float> , %x + float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, + float -0.000000e+00>, %x %res = fsub <16 x float> %y, %a2 ret <16 x float> %res } -; CHECK-LABEL: test_x86_fmadd_pd_z -; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 -; CHECK: ret define <8 x double> @test_x86_fmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { +; ALL-LABEL: test_x86_fmadd_pd_z: +; ALL: ## BB#0: +; ALL-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm0 +; ALL-NEXT: retq %x = fmul <8 x double> %a0, %a1 %res = fadd <8 x double> %x, %a2 ret <8 x double> %res } -; CHECK-LABEL: test_x86_fmsub_pd_z -; CHECK: vfmsub213pd %zmm2, %zmm1, %zmm0 -; CHECK: ret define <8 x double> @test_x86_fmsub_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { +; ALL-LABEL: test_x86_fmsub_pd_z: +; ALL: ## BB#0: +; ALL-NEXT: vfmsub213pd %zmm2, %zmm1, %zmm0 +; ALL-NEXT: retq %x = fmul <8 x double> %a0, %a1 %res = fsub <8 x double> %x, %a2 ret <8 x double> %res } define double @test_x86_fmsub_213(double %a0, double %a1, double %a2) { -; CHECK-LABEL: test_x86_fmsub_213: -; CHECK: ## BB#0: -; CHECK-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 -; CHECK-NEXT: vmovaps %zmm1, %zmm0 -; CHECK-NEXT: retq +; ALL-LABEL: test_x86_fmsub_213: +; ALL: ## BB#0: +; ALL-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 +; ALL-NEXT: vmovaps %zmm1, %zmm0 +; ALL-NEXT: retq %x = fmul double %a0, %a1 %res = fsub double %x, %a2 ret double %res } define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) { -; CHECK-LABEL: test_x86_fmsub_213_m: -; CHECK: ## BB#0: -; CHECK-NEXT: vfmsub213sd (%rdi), %xmm0, %xmm1 -; CHECK-NEXT: vmovaps %zmm1, %zmm0 -; CHECK-NEXT: retq +; KNL-LABEL: test_x86_fmsub_213_m: +; KNL: ## BB#0: +; KNL-NEXT: vfmsub213sd (%rdi), %xmm0, %xmm1 +; KNL-NEXT: vmovaps %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_x86_fmsub_213_m: +; SKX: ## BB#0: +; SKX-NEXT: vfmsub213sd (%rdi), %xmm1, %xmm0 +; SKX-NEXT: retq %a2 = load double , double *%a2_ptr %x = fmul double %a0, %a1 %res = fsub double %x, %a2 @@ -83,11 +95,11 @@ define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) { } define double @test_x86_fmsub_231_m(double %a0, double %a1, double * %a2_ptr) { -; CHECK-LABEL: test_x86_fmsub_231_m: -; CHECK: ## BB#0: -; CHECK-NEXT: vfmsub231sd (%rdi), %xmm0, %xmm1 -; CHECK-NEXT: vmovaps %zmm1, %zmm0 -; CHECK-NEXT: retq +; ALL-LABEL: test_x86_fmsub_231_m: +; ALL: ## BB#0: +; ALL-NEXT: vfmsub231sd (%rdi), %xmm0, %xmm1 +; ALL-NEXT: vmovaps %zmm1, %zmm0 +; ALL-NEXT: retq %a2 = load double , double *%a2_ptr %x = fmul double %a0, %a2 %res = fsub double %x, %a1 @@ -95,21 +107,21 @@ define double @test_x86_fmsub_231_m(double %a0, double %a1, double * %a2_ptr) { } define <16 x float> @test231_br(<16 x float> %a1, <16 x float> %a2) nounwind { -; CHECK-LABEL: test231_br: -; CHECK: ## BB#0: -; CHECK-NEXT: vfmadd231ps {{.*}}(%rip){1to16}, %zmm0, %zmm1 -; CHECK-NEXT: vmovaps %zmm1, %zmm0 -; CHECK-NEXT: retq +; ALL-LABEL: test231_br: +; ALL: ## BB#0: +; ALL-NEXT: vfmadd231ps {{.*}}(%rip){1to16}, %zmm0, %zmm1 +; ALL-NEXT: vmovaps %zmm1, %zmm0 +; ALL-NEXT: retq %b1 = fmul <16 x float> %a1, %b2 = fadd <16 x float> %b1, %a2 ret <16 x float> %b2 } define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind { -; CHECK-LABEL: test213_br: -; CHECK: ## BB#0: -; CHECK-NEXT: vfmadd213ps {{.*}}(%rip){1to16}, %zmm1, %zmm0 -; CHECK-NEXT: retq +; ALL-LABEL: test213_br: +; ALL: ## BB#0: +; ALL-NEXT: vfmadd213ps {{.*}}(%rip){1to16}, %zmm1, %zmm0 +; ALL-NEXT: retq %b1 = fmul <16 x float> %a1, %a2 %b2 = fadd <16 x float> %b1, ret <16 x float> %b2 @@ -117,16 +129,17 @@ define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind { ;mask (a*c+b , a) define <16 x float> @test_x86_fmadd132_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) { -; CHECK-LABEL: test_x86_fmadd132_ps: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxbd %xmm2, %zmm2 -; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 -; CHECK-NEXT: vptestmd %zmm2, %zmm2, %k1 -; CHECK-NEXT: vfmadd132ps (%rdi), %zmm1, %zmm0 {%k1} -; CHECK-NEXT: retq +; KNL-LABEL: test_x86_fmadd132_ps: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 +; KNL-NEXT: vpslld $31, %zmm2, %zmm2 +; KNL-NEXT: vptestmd %zmm2, %zmm2, %k1 +; KNL-NEXT: vfmadd132ps (%rdi), %zmm1, %zmm0 {%k1} +; KNL-NEXT: retq ; ; SKX-LABEL: test_x86_fmadd132_ps: ; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm2, %xmm2 ; SKX-NEXT: vpmovb2m %xmm2, %k1 ; SKX-NEXT: vfmadd132ps (%rdi), %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq @@ -139,17 +152,18 @@ define <16 x float> @test_x86_fmadd132_ps(<16 x float> %a0, <16 x float> %a1, <1 ;mask (a*c+b , b) define <16 x float> @test_x86_fmadd231_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) { -; CHECK-LABEL: test_x86_fmadd231_ps: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxbd %xmm2, %zmm2 -; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 -; CHECK-NEXT: vptestmd %zmm2, %zmm2, %k1 -; CHECK-NEXT: vfmadd231ps (%rdi), %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 -; CHECK-NEXT: retq +; KNL-LABEL: test_x86_fmadd231_ps: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 +; KNL-NEXT: vpslld $31, %zmm2, %zmm2 +; KNL-NEXT: vptestmd %zmm2, %zmm2, %k1 +; KNL-NEXT: vfmadd231ps (%rdi), %zmm0, %zmm1 {%k1} +; KNL-NEXT: vmovaps %zmm1, %zmm0 +; KNL-NEXT: retq ; ; SKX-LABEL: test_x86_fmadd231_ps: ; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm2, %xmm2 ; SKX-NEXT: vpmovb2m %xmm2, %k1 ; SKX-NEXT: vfmadd231ps (%rdi), %zmm0, %zmm1 {%k1} ; SKX-NEXT: vmovaps %zmm1, %zmm0 @@ -163,17 +177,18 @@ define <16 x float> @test_x86_fmadd231_ps(<16 x float> %a0, <16 x float> %a1, <1 ;mask (b*a+c , b) define <16 x float> @test_x86_fmadd213_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) { -; CHECK-LABEL: test_x86_fmadd213_ps: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxbd %xmm2, %zmm2 -; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 -; CHECK-NEXT: vptestmd %zmm2, %zmm2, %k1 -; CHECK-NEXT: vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 -; CHECK-NEXT: retq +; KNL-LABEL: test_x86_fmadd213_ps: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 +; KNL-NEXT: vpslld $31, %zmm2, %zmm2 +; KNL-NEXT: vptestmd %zmm2, %zmm2, %k1 +; KNL-NEXT: vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1} +; KNL-NEXT: vmovaps %zmm1, %zmm0 +; KNL-NEXT: retq ; ; SKX-LABEL: test_x86_fmadd213_ps: ; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm2, %xmm2 ; SKX-NEXT: vpmovb2m %xmm2, %k1 ; SKX-NEXT: vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1} ; SKX-NEXT: vmovaps %zmm1, %zmm0 diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index 244d761058c..015c70a6ba0 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -1,39 +1,48 @@ -; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL --check-prefix=CHECK -; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=SKX --check-prefix=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX -; CHECK-LABEL: mask16 -; CHECK: kmovw -; CHECK-NEXT: knotw -; CHECK-NEXT: kmovw define i16 @mask16(i16 %x) { +; CHECK-LABEL: mask16: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k0 +; CHECK-NEXT: knotw %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq %m0 = bitcast i16 %x to <16 x i1> %m1 = xor <16 x i1> %m0, %ret = bitcast <16 x i1> %m1 to i16 ret i16 %ret } -; CHECK-LABEL: mask8 -; KNL: kmovw -; KNL-NEXT: knotw -; KNL-NEXT: kmovw -; SKX: kmovb -; SKX-NEXT: knotb -; SKX-NEXT: kmovb - define i8 @mask8(i8 %x) { +; KNL-LABEL: mask8: +; KNL: ## BB#0: +; KNL-NEXT: movzbl %dil, %eax +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: knotw %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: mask8: +; SKX: ## BB#0: +; SKX-NEXT: kmovb %edi, %k0 +; SKX-NEXT: knotb %k0, %k0 +; SKX-NEXT: kmovb %k0, %eax +; SKX-NEXT: retq %m0 = bitcast i8 %x to <8 x i1> %m1 = xor <8 x i1> %m0, %ret = bitcast <8 x i1> %m1 to i8 ret i8 %ret } -; CHECK-LABEL: mask16_mem -; CHECK: kmovw ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}} -; CHECK-NEXT: knotw -; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]]) -; CHECK: ret - define void @mask16_mem(i16* %ptr) { +; CHECK-LABEL: mask16_mem: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw (%rdi), %k0 +; CHECK-NEXT: knotw %k0, %k0 +; CHECK-NEXT: kmovw %k0, (%rdi) +; CHECK-NEXT: retq %x = load i16, i16* %ptr, align 4 %m0 = bitcast i16 %x to <16 x i1> %m1 = xor <16 x i1> %m0, @@ -42,15 +51,20 @@ define void @mask16_mem(i16* %ptr) { ret void } -; CHECK-LABEL: mask8_mem -; KNL: kmovw ([[ARG1]]), %k{{[0-7]}} -; KNL-NEXT: knotw -; KNL-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]]) -; SKX: kmovb ([[ARG1]]), %k{{[0-7]}} -; SKX-NEXT: knotb -; SKX-NEXT: kmovb %k{{[0-7]}}, ([[ARG1]]) - define void @mask8_mem(i8* %ptr) { +; KNL-LABEL: mask8_mem: +; KNL: ## BB#0: +; KNL-NEXT: kmovw (%rdi), %k0 +; KNL-NEXT: knotw %k0, %k0 +; KNL-NEXT: kmovw %k0, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: mask8_mem: +; SKX: ## BB#0: +; SKX-NEXT: kmovb (%rdi), %k0 +; SKX-NEXT: knotb %k0, %k0 +; SKX-NEXT: kmovb %k0, (%rdi) +; SKX-NEXT: retq %x = load i8, i8* %ptr, align 4 %m0 = bitcast i8 %x to <8 x i1> %m1 = xor <8 x i1> %m0, @@ -59,11 +73,16 @@ define void @mask8_mem(i8* %ptr) { ret void } -; CHECK-LABEL: mand16 -; CHECK: kandw -; CHECK: kxorw -; CHECK: korw define i16 @mand16(i16 %x, i16 %y) { +; CHECK-LABEL: mand16: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k0 +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: kandw %k1, %k0, %k2 +; CHECK-NEXT: kxorw %k1, %k0, %k0 +; CHECK-NEXT: korw %k0, %k2, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq %ma = bitcast i16 %x to <16 x i1> %mb = bitcast i16 %y to <16 x i1> %mc = and <16 x i1> %ma, %mb @@ -73,56 +92,68 @@ define i16 @mand16(i16 %x, i16 %y) { ret i16 %ret } -; CHECK-LABEL: shuf_test1 -; CHECK: kshiftrw $8 define i8 @shuf_test1(i16 %v) nounwind { +; KNL-LABEL: shuf_test1: +; KNL: ## BB#0: +; KNL-NEXT: kmovw %edi, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: shuf_test1: +; SKX: ## BB#0: +; SKX-NEXT: kmovw %edi, %k0 +; SKX-NEXT: kshiftrw $8, %k0, %k0 +; SKX-NEXT: kmovb %k0, %eax +; SKX-NEXT: retq %v1 = bitcast i16 %v to <16 x i1> %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> %mask1 = bitcast <8 x i1> %mask to i8 ret i8 %mask1 } -; CHECK-LABEL: zext_test1 -; CHECK: kshiftlw -; CHECK: kshiftrw -; CHECK: kmovw - define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) { +; CHECK-LABEL: zext_test1: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 +; CHECK-NEXT: kshiftlw $10, %k0, %k0 +; CHECK-NEXT: kshiftrw $15, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: retq %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 %res = zext i1 %cmp_res.i1 to i32 ret i32 %res -} - -; CHECK-LABEL: zext_test2 -; CHECK: kshiftlw -; CHECK: kshiftrw -; CHECK: kmovw - -define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) { +}define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) { %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 %res = zext i1 %cmp_res.i1 to i16 ret i16 %res -} - -; CHECK-LABEL: zext_test3 -; CHECK: kshiftlw -; CHECK: kshiftrw -; CHECK: kmovw - -define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) { +}define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) { %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 %res = zext i1 %cmp_res.i1 to i8 ret i8 %res } -; CHECK-LABEL: conv1 -; KNL: kmovw %k0, %eax -; KNL: movb %al, (%rdi) -; SKX: kmovb %k0, (%rdi) define i8 @conv1(<8 x i1>* %R) { +; KNL-LABEL: conv1: +; KNL: ## BB#0: ## %entry +; KNL-NEXT: kxnorw %k0, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: movb %al, (%rdi) +; KNL-NEXT: movb $-2, -{{[0-9]+}}(%rsp) +; KNL-NEXT: movb $-2, %al +; KNL-NEXT: retq +; +; SKX-LABEL: conv1: +; SKX: ## BB#0: ## %entry +; SKX-NEXT: kxnorw %k0, %k0, %k0 +; SKX-NEXT: kmovb %k0, (%rdi) +; SKX-NEXT: movb $-2, -{{[0-9]+}}(%rsp) +; SKX-NEXT: movb $-2, %al +; SKX-NEXT: retq entry: store <8 x i1> , <8 x i1>* %R @@ -133,12 +164,27 @@ entry: ret i8 %mask_convert } -; SKX-LABEL: test4 -; SKX: vpcmpgt -; SKX: knot -; SKX: vpcmpgt -; SKX: vpmovm2d define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) { +; KNL-LABEL: test4: +; KNL: ## BB#0: +; KNL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: vpslld $31, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm1 +; KNL-NEXT: vpmovqd %zmm1, %ymm1 +; KNL-NEXT: vpslld $31, %xmm1, %xmm1 +; KNL-NEXT: vpsrad $31, %xmm1, %xmm1 +; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test4: +; SKX: ## BB#0: +; SKX-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 +; SKX-NEXT: knotw %k0, %k1 +; SKX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} +; SKX-NEXT: vpmovm2d %k0, %xmm0 +; SKX-NEXT: retq %x_gt_y = icmp sgt <4 x i64> %x, %y %x1_gt_y1 = icmp sgt <4 x i64> %x1, %y1 %res = icmp sgt <4 x i1>%x_gt_y, %x1_gt_y1 @@ -146,30 +192,27 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1 ret <4 x i32> %resse } -; SKX-LABEL: test5 -; SKX: vpcmpgt -; SKX: knot -; SKX: vpcmpgt -; SKX: vpmovm2q define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) { +; KNL-LABEL: test5: +; KNL: ## BB#0: +; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1 +; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test5: +; SKX: ## BB#0: +; SKX-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 +; SKX-NEXT: knotw %k0, %k1 +; SKX-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} +; SKX-NEXT: vpmovm2q %k0, %xmm0 +; SKX-NEXT: retq %x_gt_y = icmp slt <2 x i64> %x, %y %x1_gt_y1 = icmp sgt <2 x i64> %x1, %y1 %res = icmp slt <2 x i1>%x_gt_y, %x1_gt_y1 %resse = sext <2 x i1>%res to <2 x i64> ret <2 x i64> %resse -} - -; KNL-LABEL: test6 -; KNL: vpmovsxbd -; KNL: vpandd -; KNL: kmovw %eax, %k1 -; KNL vptestmd {{.*}}, %k0 {%k1} - -; SKX-LABEL: test6 -; SKX: vpmovb2m -; SKX: kmovw %eax, %k1 -; SKX: kandw -define void @test6(<16 x i1> %mask) { +}define void @test6(<16 x i1> %mask) { allocas: %a= and <16 x i1> %mask, %b = bitcast <16 x i1> %a to i16 @@ -182,19 +225,30 @@ true: false: ret void } - -; KNL-LABEL: test7 -; KNL: vpmovsxwq -; KNL: vpandq -; KNL: vptestmq {{.*}}, %k0 -; KNL: korw - -; SKX-LABEL: test7 -; SKX: vpmovw2m -; SKX: kmovb %eax, %k1 -; SKX: korb - define void @test7(<8 x i1> %mask) { +; KNL-LABEL: test7: +; KNL: ## BB#0: ## %allocas +; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 +; KNL-NEXT: movb $85, %al +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb %al, %al +; KNL-NEXT: retq +; +; SKX-LABEL: test7: +; SKX: ## BB#0: ## %allocas +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpmovw2m %xmm0, %k0 +; SKX-NEXT: movb $85, %al +; SKX-NEXT: kmovb %eax, %k1 +; SKX-NEXT: korb %k1, %k0, %k0 +; SKX-NEXT: kmovb %k0, %eax +; SKX-NEXT: testb %al, %al +; SKX-NEXT: retq allocas: %a= or <8 x i1> %mask, %b = bitcast <8 x i1> %a to i8 @@ -207,22 +261,35 @@ true: false: ret void } - -; KNL-LABEL: test8 -; KNL: vpxord %zmm2, %zmm2, %zmm2 -; KNL: jg -; KNL: vpcmpltud %zmm2, %zmm1, %k1 -; KNL: jmp -; KNL: vpcmpgtd %zmm2, %zmm0, %k1 - -; SKX-LABEL: test8 -; SKX: jg -; SKX: vpcmpltud {{.*}}, %k0 -; SKX: vpmovm2b -; SKX: vpcmpgtd {{.*}}, %k0 -; SKX: vpmovm2b - define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) { +; KNL-LABEL: test8: +; KNL: ## BB#0: +; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; KNL-NEXT: cmpl %esi, %edi +; KNL-NEXT: jg LBB14_1 +; KNL-NEXT: ## BB#2: +; KNL-NEXT: vpcmpltud %zmm2, %zmm1, %k1 +; KNL-NEXT: jmp LBB14_3 +; KNL-NEXT: LBB14_1: +; KNL-NEXT: vpcmpgtd %zmm2, %zmm0, %k1 +; KNL-NEXT: LBB14_3: +; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test8: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; SKX-NEXT: cmpl %esi, %edi +; SKX-NEXT: jg LBB14_1 +; SKX-NEXT: ## BB#2: +; SKX-NEXT: vpcmpltud %zmm2, %zmm1, %k0 +; SKX-NEXT: vpmovm2b %k0, %xmm0 +; SKX-NEXT: retq +; SKX-NEXT: LBB14_1: +; SKX-NEXT: vpcmpgtd %zmm2, %zmm0, %k0 +; SKX-NEXT: vpmovm2b %k0, %xmm0 +; SKX-NEXT: retq %cond = icmp sgt i32 %a1, %b1 %cmp1 = icmp sgt <16 x i32> %a, zeroinitializer %cmp2 = icmp ult <16 x i32> %b, zeroinitializer @@ -230,91 +297,121 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) { %res = sext <16 x i1> %mix to <16 x i8> ret <16 x i8> %res } - -; KNL-LABEL: test9 -; KNL: jg -; KNL: vpmovsxbd %xmm1, %zmm0 -; KNL: jmp -; KNL: vpmovsxbd %xmm0, %zmm0 - -; SKX-LABEL: test9 -; SKX: vpmovb2m %xmm1, %k0 -; SKX: vpmovm2b %k0, %xmm0 -; SKX: retq -; SKX: vpmovb2m %xmm0, %k0 -; SKX: vpmovm2b %k0, %xmm0 - define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) { +; KNL-LABEL: test9: +; KNL: ## BB#0: +; KNL-NEXT: cmpl %esi, %edi +; KNL-NEXT: jg LBB15_1 +; KNL-NEXT: ## BB#2: +; KNL-NEXT: vpmovsxbd %xmm1, %zmm0 +; KNL-NEXT: jmp LBB15_3 +; KNL-NEXT: LBB15_1: +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: LBB15_3: +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test9: +; SKX: ## BB#0: +; SKX-NEXT: cmpl %esi, %edi +; SKX-NEXT: jg LBB15_1 +; SKX-NEXT: ## BB#2: +; SKX-NEXT: vpsllw $7, %xmm1, %xmm0 +; SKX-NEXT: jmp LBB15_3 +; SKX-NEXT: LBB15_1: +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 +; SKX-NEXT: LBB15_3: +; SKX-NEXT: vpmovb2m %xmm0, %k0 +; SKX-NEXT: vpmovm2b %k0, %xmm0 +; SKX-NEXT: retq %mask = icmp sgt i32 %a1, %b1 %c = select i1 %mask, <16 x i1>%a, <16 x i1>%b ret <16 x i1>%c -} - -; KNL-LABEL: test10 -; KNL: jg -; KNL: vpmovsxwq %xmm1, %zmm0 -; KNL: jmp -; KNL: vpmovsxwq %xmm0, %zmm0 - -; SKX-LABEL: test10 -; SKX: jg -; SKX: vpmovw2m %xmm1, %k0 -; SKX: vpmovm2w %k0, %xmm0 -; SKX: retq -; SKX: vpmovw2m %xmm0, %k0 -; SKX: vpmovm2w %k0, %xmm0 -define <8 x i1> @test10(<8 x i1>%a, <8 x i1>%b, i32 %a1, i32 %b1) { +}define <8 x i1> @test10(<8 x i1>%a, <8 x i1>%b, i32 %a1, i32 %b1) { %mask = icmp sgt i32 %a1, %b1 %c = select i1 %mask, <8 x i1>%a, <8 x i1>%b ret <8 x i1>%c } -; SKX-LABEL: test11 -; SKX: jg -; SKX: vpmovd2m %xmm1, %k0 -; SKX: vpmovm2d %k0, %xmm0 -; SKX: retq -; SKX: vpmovd2m %xmm0, %k0 -; SKX: vpmovm2d %k0, %xmm0 define <4 x i1> @test11(<4 x i1>%a, <4 x i1>%b, i32 %a1, i32 %b1) { +; KNL-LABEL: test11: +; KNL: ## BB#0: +; KNL-NEXT: cmpl %esi, %edi +; KNL-NEXT: jg LBB17_2 +; KNL-NEXT: ## BB#1: +; KNL-NEXT: vmovaps %zmm1, %zmm0 +; KNL-NEXT: LBB17_2: +; KNL-NEXT: retq +; +; SKX-LABEL: test11: +; SKX: ## BB#0: +; SKX-NEXT: cmpl %esi, %edi +; SKX-NEXT: jg LBB17_1 +; SKX-NEXT: ## BB#2: +; SKX-NEXT: vpslld $31, %xmm1, %xmm0 +; SKX-NEXT: jmp LBB17_3 +; SKX-NEXT: LBB17_1: +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 +; SKX-NEXT: LBB17_3: +; SKX-NEXT: vpmovd2m %xmm0, %k0 +; SKX-NEXT: vpmovm2d %k0, %xmm0 +; SKX-NEXT: retq %mask = icmp sgt i32 %a1, %b1 %c = select i1 %mask, <4 x i1>%a, <4 x i1>%b ret <4 x i1>%c } -; KNL-LABEL: test12 -; KNL: movl %edi, %eax define i32 @test12(i32 %x, i32 %y) { +; CHECK-LABEL: test12: +; CHECK: ## BB#0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq %a = bitcast i16 21845 to <16 x i1> %b = extractelement <16 x i1> %a, i32 0 %c = select i1 %b, i32 %x, i32 %y ret i32 %c } -; KNL-LABEL: test13 -; KNL: movl %esi, %eax define i32 @test13(i32 %x, i32 %y) { +; CHECK-LABEL: test13: +; CHECK: ## BB#0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: retq %a = bitcast i16 21845 to <16 x i1> %b = extractelement <16 x i1> %a, i32 3 %c = select i1 %b, i32 %x, i32 %y ret i32 %c -} - -; SKX-LABEL: test14 -; SKX: movb $11, %al -; SKX: kmovb %eax, %k0 -; SKX: vpmovm2d %k0, %xmm0 - -define <4 x i1> @test14() { +}define <4 x i1> @test14() { %a = bitcast i16 21845 to <16 x i1> %b = extractelement <16 x i1> %a, i32 2 %c = insertelement <4 x i1> , i1 %b, i32 1 ret <4 x i1> %c } -; KNL-LABEL: test15 -; KNL: cmovgw define <16 x i1> @test15(i32 %x, i32 %y) { +; KNL-LABEL: test15: +; KNL: ## BB#0: +; KNL-NEXT: cmpl %esi, %edi +; KNL-NEXT: movw $21845, %ax ## imm = 0x5555 +; KNL-NEXT: movw $1, %cx +; KNL-NEXT: cmovgw %ax, %cx +; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test15: +; SKX: ## BB#0: +; SKX-NEXT: cmpl %esi, %edi +; SKX-NEXT: movw $21845, %ax ## imm = 0x5555 +; SKX-NEXT: movw $1, %cx +; SKX-NEXT: cmovgw %ax, %cx +; SKX-NEXT: kmovw %ecx, %k0 +; SKX-NEXT: vpmovm2b %k0, %xmm0 +; SKX-NEXT: retq %a = bitcast i16 21845 to <16 x i1> %b = bitcast i16 1 to <16 x i1> %mask = icmp sgt i32 %x, %y @@ -322,27 +419,914 @@ define <16 x i1> @test15(i32 %x, i32 %y) { ret <16 x i1> %c } -; SKX-LABEL: test16 -; SKX: kxnorw %k0, %k0, %k1 -; SKX: kshiftrw $15, %k1, %k1 -; SKX: kshiftlq $5, %k1, %k1 -; SKX: korq %k1, %k0, %k0 -; SKX: vpmovm2b %k0, %zmm0 define <64 x i8> @test16(i64 %x) { +; KNL-LABEL: test16: +; KNL: ## BB#0: +; KNL-NEXT: pushq %rbp +; KNL-NEXT: Ltmp0: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: Ltmp1: +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: Ltmp2: +; KNL-NEXT: .cfi_def_cfa_register %rbp +; KNL-NEXT: pushq %r15 +; KNL-NEXT: pushq %r14 +; KNL-NEXT: pushq %r13 +; KNL-NEXT: pushq %r12 +; KNL-NEXT: pushq %rbx +; KNL-NEXT: andq $-32, %rsp +; KNL-NEXT: subq $128, %rsp +; KNL-NEXT: Ltmp3: +; KNL-NEXT: .cfi_offset %rbx, -56 +; KNL-NEXT: Ltmp4: +; KNL-NEXT: .cfi_offset %r12, -48 +; KNL-NEXT: Ltmp5: +; KNL-NEXT: .cfi_offset %r13, -40 +; KNL-NEXT: Ltmp6: +; KNL-NEXT: .cfi_offset %r14, -32 +; KNL-NEXT: Ltmp7: +; KNL-NEXT: .cfi_offset %r15, -24 +; KNL-NEXT: movq %rdi, %rax +; KNL-NEXT: shrq $32, %rax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl $271, %eax ## imm = 0x10F +; KNL-NEXT: bextrl %eax, %edi, %eax +; KNL-NEXT: movl %edi, %ecx +; KNL-NEXT: andl $1, %ecx +; KNL-NEXT: vmovd %ecx, %xmm0 +; KNL-NEXT: movl $257, %ecx ## imm = 0x101 +; KNL-NEXT: bextrl %ecx, %edi, %ecx +; KNL-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 +; KNL-NEXT: movl $258, %ecx ## imm = 0x102 +; KNL-NEXT: bextrl %ecx, %edi, %ecx +; KNL-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; KNL-NEXT: movl $259, %ecx ## imm = 0x103 +; KNL-NEXT: bextrl %ecx, %edi, %ecx +; KNL-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; KNL-NEXT: movl $260, %ecx ## imm = 0x104 +; KNL-NEXT: bextrl %ecx, %edi, %ecx +; KNL-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 +; KNL-NEXT: movl $261, %ecx ## imm = 0x105 +; KNL-NEXT: bextrl %ecx, %edi, %ecx +; KNL-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 +; KNL-NEXT: movl $262, %ecx ## imm = 0x106 +; KNL-NEXT: bextrl %ecx, %edi, %ecx +; KNL-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; KNL-NEXT: movl $263, %ecx ## imm = 0x107 +; KNL-NEXT: bextrl %ecx, %edi, %ecx +; KNL-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; KNL-NEXT: movl $264, %ecx ## imm = 0x108 +; KNL-NEXT: bextrl %ecx, %edi, %ecx +; KNL-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; KNL-NEXT: movl $265, %ecx ## imm = 0x109 +; KNL-NEXT: bextrl %ecx, %edi, %ecx +; KNL-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; KNL-NEXT: movl $266, %ecx ## imm = 0x10A +; KNL-NEXT: bextrl %ecx, %edi, %ecx +; KNL-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; KNL-NEXT: movl $267, %ecx ## imm = 0x10B +; KNL-NEXT: bextrl %ecx, %edi, %ecx +; KNL-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; KNL-NEXT: movl $268, %ecx ## imm = 0x10C +; KNL-NEXT: bextrl %ecx, %edi, %ecx +; KNL-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; KNL-NEXT: movl $269, %ecx ## imm = 0x10D +; KNL-NEXT: bextrl %ecx, %edi, %ecx +; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; KNL-NEXT: movl $270, %ecx ## imm = 0x10E +; KNL-NEXT: bextrl %ecx, %edi, %ecx +; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1 +; KNL-NEXT: movl $1, %eax +; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm0 +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %r15d +; KNL-NEXT: movq %r15, %rdx +; KNL-NEXT: shrq $17, %rdx +; KNL-NEXT: andb $1, %dl +; KNL-NEXT: je LBB22_2 +; KNL-NEXT: ## BB#1: +; KNL-NEXT: movb $-1, %dl +; KNL-NEXT: LBB22_2: +; KNL-NEXT: movq %r15, %r11 +; KNL-NEXT: shrq $16, %r11 +; KNL-NEXT: andb $1, %r11b +; KNL-NEXT: je LBB22_4 +; KNL-NEXT: ## BB#3: +; KNL-NEXT: movb $-1, %r11b +; KNL-NEXT: LBB22_4: +; KNL-NEXT: movq %r15, %r10 +; KNL-NEXT: shrq $18, %r10 +; KNL-NEXT: andb $1, %r10b +; KNL-NEXT: je LBB22_6 +; KNL-NEXT: ## BB#5: +; KNL-NEXT: movb $-1, %r10b +; KNL-NEXT: LBB22_6: +; KNL-NEXT: movq %r15, %r9 +; KNL-NEXT: shrq $19, %r9 +; KNL-NEXT: andb $1, %r9b +; KNL-NEXT: je LBB22_8 +; KNL-NEXT: ## BB#7: +; KNL-NEXT: movb $-1, %r9b +; KNL-NEXT: LBB22_8: +; KNL-NEXT: movq %r15, %rbx +; KNL-NEXT: shrq $20, %rbx +; KNL-NEXT: andb $1, %bl +; KNL-NEXT: je LBB22_10 +; KNL-NEXT: ## BB#9: +; KNL-NEXT: movb $-1, %bl +; KNL-NEXT: LBB22_10: +; KNL-NEXT: movq %r15, %r12 +; KNL-NEXT: shrq $21, %r12 +; KNL-NEXT: andb $1, %r12b +; KNL-NEXT: je LBB22_12 +; KNL-NEXT: ## BB#11: +; KNL-NEXT: movb $-1, %r12b +; KNL-NEXT: LBB22_12: +; KNL-NEXT: movq %r15, %r14 +; KNL-NEXT: shrq $22, %r14 +; KNL-NEXT: andb $1, %r14b +; KNL-NEXT: je LBB22_14 +; KNL-NEXT: ## BB#13: +; KNL-NEXT: movb $-1, %r14b +; KNL-NEXT: LBB22_14: +; KNL-NEXT: movq %r15, %r8 +; KNL-NEXT: shrq $23, %r8 +; KNL-NEXT: andb $1, %r8b +; KNL-NEXT: je LBB22_16 +; KNL-NEXT: ## BB#15: +; KNL-NEXT: movb $-1, %r8b +; KNL-NEXT: LBB22_16: +; KNL-NEXT: movq %r15, %r13 +; KNL-NEXT: shrq $24, %r13 +; KNL-NEXT: andb $1, %r13b +; KNL-NEXT: je LBB22_18 +; KNL-NEXT: ## BB#17: +; KNL-NEXT: movb $-1, %r13b +; KNL-NEXT: LBB22_18: +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $25, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB22_20 +; KNL-NEXT: ## BB#19: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB22_20: +; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $26, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB22_22 +; KNL-NEXT: ## BB#21: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB22_22: +; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill +; KNL-NEXT: movl $272, %esi ## imm = 0x110 +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $27, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB22_24 +; KNL-NEXT: ## BB#23: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB22_24: +; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill +; KNL-NEXT: movl $273, %eax ## imm = 0x111 +; KNL-NEXT: bextrl %esi, %edi, %esi +; KNL-NEXT: movq %r15, %rcx +; KNL-NEXT: shrq $28, %rcx +; KNL-NEXT: andb $1, %cl +; KNL-NEXT: je LBB22_26 +; KNL-NEXT: ## BB#25: +; KNL-NEXT: movb $-1, %cl +; KNL-NEXT: LBB22_26: +; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill +; KNL-NEXT: bextrl %eax, %edi, %eax +; KNL-NEXT: vmovd %esi, %xmm2 +; KNL-NEXT: movl $274, %esi ## imm = 0x112 +; KNL-NEXT: movq %r15, %rcx +; KNL-NEXT: shrq $29, %rcx +; KNL-NEXT: andb $1, %cl +; KNL-NEXT: je LBB22_28 +; KNL-NEXT: ## BB#27: +; KNL-NEXT: movb $-1, %cl +; KNL-NEXT: LBB22_28: +; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill +; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; KNL-NEXT: bextrl %esi, %edi, %eax +; KNL-NEXT: movzbl %r11b, %esi +; KNL-NEXT: movq %r15, %rcx +; KNL-NEXT: shrq $30, %rcx +; KNL-NEXT: andb $1, %cl +; KNL-NEXT: je LBB22_30 +; KNL-NEXT: ## BB#29: +; KNL-NEXT: movb $-1, %cl +; KNL-NEXT: LBB22_30: +; KNL-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; KNL-NEXT: movl $275, %eax ## imm = 0x113 +; KNL-NEXT: bextrl %eax, %edi, %r11d +; KNL-NEXT: movzbl %dl, %edx +; KNL-NEXT: vmovd %esi, %xmm3 +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $31, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB22_32 +; KNL-NEXT: ## BB#31: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB22_32: +; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill +; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill +; KNL-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2 +; KNL-NEXT: movl $276, %eax ## imm = 0x114 +; KNL-NEXT: bextrl %eax, %edi, %esi +; KNL-NEXT: movl $277, %r11d ## imm = 0x115 +; KNL-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3 +; KNL-NEXT: movzbl %r10b, %r10d +; KNL-NEXT: movb %r15b, %al +; KNL-NEXT: shrb %al +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB22_34 +; KNL-NEXT: ## BB#33: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB22_34: +; KNL-NEXT: vpinsrb $4, %esi, %xmm2, %xmm2 +; KNL-NEXT: bextrl %r11d, %edi, %edx +; KNL-NEXT: movl $278, %r11d ## imm = 0x116 +; KNL-NEXT: vpinsrb $2, %r10d, %xmm3, %xmm3 +; KNL-NEXT: movzbl %r9b, %esi +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: movq %r15, %rcx +; KNL-NEXT: shlq $63, %rcx +; KNL-NEXT: sarq $63, %rcx +; KNL-NEXT: vmovd %ecx, %xmm4 +; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; KNL-NEXT: movb %r15b, %al +; KNL-NEXT: shrb $2, %al +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB22_36 +; KNL-NEXT: ## BB#35: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB22_36: +; KNL-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; KNL-NEXT: bextrl %r11d, %edi, %edx +; KNL-NEXT: movl $279, %r9d ## imm = 0x117 +; KNL-NEXT: vpinsrb $3, %esi, %xmm3, %xmm3 +; KNL-NEXT: movzbl %bl, %ebx +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 +; KNL-NEXT: movb %r15b, %al +; KNL-NEXT: shrb $3, %al +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB22_38 +; KNL-NEXT: ## BB#37: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB22_38: +; KNL-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; KNL-NEXT: bextrl %r9d, %edi, %edx +; KNL-NEXT: movl $280, %esi ## imm = 0x118 +; KNL-NEXT: vpinsrb $4, %ebx, %xmm3, %xmm3 +; KNL-NEXT: movzbl %r12b, %ebx +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 +; KNL-NEXT: movb %r15b, %al +; KNL-NEXT: shrb $4, %al +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB22_40 +; KNL-NEXT: ## BB#39: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB22_40: +; KNL-NEXT: vpinsrb $7, %edx, %xmm2, %xmm2 +; KNL-NEXT: bextrl %esi, %edi, %ecx +; KNL-NEXT: movl $281, %edx ## imm = 0x119 +; KNL-NEXT: vpinsrb $5, %ebx, %xmm3, %xmm3 +; KNL-NEXT: movzbl %r14b, %esi +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 +; KNL-NEXT: movb %r15b, %al +; KNL-NEXT: shrb $5, %al +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB22_42 +; KNL-NEXT: ## BB#41: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB22_42: +; KNL-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 +; KNL-NEXT: bextrl %edx, %edi, %ecx +; KNL-NEXT: movl $282, %edx ## imm = 0x11A +; KNL-NEXT: vpinsrb $6, %esi, %xmm3, %xmm3 +; KNL-NEXT: movzbl %r8b, %esi +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 +; KNL-NEXT: movb %r15b, %bl +; KNL-NEXT: shrb $6, %bl +; KNL-NEXT: andb $1, %bl +; KNL-NEXT: je LBB22_44 +; KNL-NEXT: ## BB#43: +; KNL-NEXT: movb $-1, %bl +; KNL-NEXT: LBB22_44: +; KNL-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; KNL-NEXT: bextrl %edx, %edi, %eax +; KNL-NEXT: movl $283, %ecx ## imm = 0x11B +; KNL-NEXT: vpinsrb $7, %esi, %xmm3, %xmm3 +; KNL-NEXT: movzbl %r13b, %esi +; KNL-NEXT: movzbl %bl, %edx +; KNL-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4 +; KNL-NEXT: movb %r15b, %bl +; KNL-NEXT: shrb $7, %bl +; KNL-NEXT: je LBB22_46 +; KNL-NEXT: ## BB#45: +; KNL-NEXT: movb $-1, %bl +; KNL-NEXT: LBB22_46: +; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; KNL-NEXT: bextrl %ecx, %edi, %ecx +; KNL-NEXT: movl $284, %edx ## imm = 0x11C +; KNL-NEXT: vpinsrb $8, %esi, %xmm3, %xmm3 +; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rax ## 8-byte Reload +; KNL-NEXT: movzbl %al, %esi +; KNL-NEXT: movzbl %bl, %eax +; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $8, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB22_48 +; KNL-NEXT: ## BB#47: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB22_48: +; KNL-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; KNL-NEXT: bextrl %edx, %edi, %ecx +; KNL-NEXT: movl $285, %edx ## imm = 0x11D +; KNL-NEXT: vpinsrb $9, %esi, %xmm3, %xmm3 +; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload +; KNL-NEXT: movzbl %sil, %esi +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $9, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB22_50 +; KNL-NEXT: ## BB#49: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB22_50: +; KNL-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; KNL-NEXT: bextrl %edx, %edi, %ecx +; KNL-NEXT: movl $286, %edx ## imm = 0x11E +; KNL-NEXT: vpinsrb $10, %esi, %xmm3, %xmm3 +; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload +; KNL-NEXT: movzbl %sil, %esi +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $10, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB22_52 +; KNL-NEXT: ## BB#51: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB22_52: +; KNL-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; KNL-NEXT: bextrl %edx, %edi, %edx +; KNL-NEXT: vpinsrb $11, %esi, %xmm3, %xmm3 +; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload +; KNL-NEXT: movzbl %cl, %ecx +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $11, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB22_54 +; KNL-NEXT: ## BB#53: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB22_54: +; KNL-NEXT: vpinsrb $14, %edx, %xmm2, %xmm2 +; KNL-NEXT: shrl $31, %edi +; KNL-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3 +; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload +; KNL-NEXT: movzbl %cl, %ecx +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $12, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB22_56 +; KNL-NEXT: ## BB#55: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB22_56: +; KNL-NEXT: vpinsrb $15, %edi, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3 +; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload +; KNL-NEXT: movzbl %cl, %ecx +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $13, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB22_58 +; KNL-NEXT: ## BB#57: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB22_58: +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: vpinsrb $14, %ecx, %xmm3, %xmm2 +; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload +; KNL-NEXT: movzbl %cl, %ecx +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm3 +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $14, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB22_60 +; KNL-NEXT: ## BB#59: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB22_60: +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; KNL-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1 +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm2 +; KNL-NEXT: shrq $15, %r15 +; KNL-NEXT: andb $1, %r15b +; KNL-NEXT: je LBB22_62 +; KNL-NEXT: ## BB#61: +; KNL-NEXT: movb $-1, %r15b +; KNL-NEXT: LBB22_62: +; KNL-NEXT: movzbl %r15b, %eax +; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 +; KNL-NEXT: leaq -40(%rbp), %rsp +; KNL-NEXT: popq %rbx +; KNL-NEXT: popq %r12 +; KNL-NEXT: popq %r13 +; KNL-NEXT: popq %r14 +; KNL-NEXT: popq %r15 +; KNL-NEXT: popq %rbp +; KNL-NEXT: retq +; +; SKX-LABEL: test16: +; SKX: ## BB#0: +; SKX-NEXT: kmovq %rdi, %k0 +; SKX-NEXT: kxnorw %k0, %k0, %k1 +; SKX-NEXT: kshiftrw $15, %k1, %k1 +; SKX-NEXT: kshiftlq $5, %k1, %k1 +; SKX-NEXT: korq %k1, %k0, %k0 +; SKX-NEXT: vpmovm2b %k0, %zmm0 +; SKX-NEXT: retq %a = bitcast i64 %x to <64 x i1> %b = insertelement <64 x i1>%a, i1 true, i32 5 %c = sext <64 x i1>%b to <64 x i8> ret <64 x i8>%c } -; SKX-LABEL: test17 -; SKX: setg %al -; SKX: andl $1, %eax -; SKX: kmovw %eax, %k1 -; SKX: kshiftlq $5, %k1, %k1 -; SKX: korq %k1, %k0, %k0 -; SKX: vpmovm2b %k0, %zmm0 define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { +; KNL-LABEL: test17: +; KNL: ## BB#0: +; KNL-NEXT: pushq %rbp +; KNL-NEXT: Ltmp8: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: Ltmp9: +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: Ltmp10: +; KNL-NEXT: .cfi_def_cfa_register %rbp +; KNL-NEXT: pushq %r15 +; KNL-NEXT: pushq %r14 +; KNL-NEXT: pushq %r13 +; KNL-NEXT: pushq %r12 +; KNL-NEXT: pushq %rbx +; KNL-NEXT: andq $-32, %rsp +; KNL-NEXT: subq $128, %rsp +; KNL-NEXT: Ltmp11: +; KNL-NEXT: .cfi_offset %rbx, -56 +; KNL-NEXT: Ltmp12: +; KNL-NEXT: .cfi_offset %r12, -48 +; KNL-NEXT: Ltmp13: +; KNL-NEXT: .cfi_offset %r13, -40 +; KNL-NEXT: Ltmp14: +; KNL-NEXT: .cfi_offset %r14, -32 +; KNL-NEXT: Ltmp15: +; KNL-NEXT: .cfi_offset %r15, -24 +; KNL-NEXT: movq %rdi, %rax +; KNL-NEXT: shrq $32, %rax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl %edi, %eax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: movl $257, %eax ## imm = 0x101 +; KNL-NEXT: bextrl %eax, %edi, %eax +; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; KNL-NEXT: movl $258, %eax ## imm = 0x102 +; KNL-NEXT: bextrl %eax, %edi, %eax +; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; KNL-NEXT: movl $259, %eax ## imm = 0x103 +; KNL-NEXT: bextrl %eax, %edi, %eax +; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; KNL-NEXT: movl $260, %eax ## imm = 0x104 +; KNL-NEXT: bextrl %eax, %edi, %eax +; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; KNL-NEXT: movl $261, %eax ## imm = 0x105 +; KNL-NEXT: bextrl %eax, %edi, %eax +; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; KNL-NEXT: movl $262, %eax ## imm = 0x106 +; KNL-NEXT: bextrl %eax, %edi, %eax +; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; KNL-NEXT: movl $263, %eax ## imm = 0x107 +; KNL-NEXT: bextrl %eax, %edi, %eax +; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; KNL-NEXT: movl $264, %eax ## imm = 0x108 +; KNL-NEXT: bextrl %eax, %edi, %eax +; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; KNL-NEXT: movl $265, %eax ## imm = 0x109 +; KNL-NEXT: bextrl %eax, %edi, %eax +; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; KNL-NEXT: movl $266, %eax ## imm = 0x10A +; KNL-NEXT: bextrl %eax, %edi, %eax +; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; KNL-NEXT: movl $267, %eax ## imm = 0x10B +; KNL-NEXT: bextrl %eax, %edi, %eax +; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; KNL-NEXT: movl $268, %eax ## imm = 0x10C +; KNL-NEXT: bextrl %eax, %edi, %eax +; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; KNL-NEXT: movl $269, %eax ## imm = 0x10D +; KNL-NEXT: bextrl %eax, %edi, %eax +; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; KNL-NEXT: movl $270, %eax ## imm = 0x10E +; KNL-NEXT: bextrl %eax, %edi, %eax +; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; KNL-NEXT: movl $271, %eax ## imm = 0x10F +; KNL-NEXT: bextrl %eax, %edi, %eax +; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1 +; KNL-NEXT: cmpl %edx, %esi +; KNL-NEXT: setg %al +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm0 +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %r15d +; KNL-NEXT: movq %r15, %rdx +; KNL-NEXT: shrq $17, %rdx +; KNL-NEXT: andb $1, %dl +; KNL-NEXT: je LBB23_2 +; KNL-NEXT: ## BB#1: +; KNL-NEXT: movb $-1, %dl +; KNL-NEXT: LBB23_2: +; KNL-NEXT: movq %r15, %r11 +; KNL-NEXT: shrq $16, %r11 +; KNL-NEXT: andb $1, %r11b +; KNL-NEXT: je LBB23_4 +; KNL-NEXT: ## BB#3: +; KNL-NEXT: movb $-1, %r11b +; KNL-NEXT: LBB23_4: +; KNL-NEXT: movq %r15, %r10 +; KNL-NEXT: shrq $18, %r10 +; KNL-NEXT: andb $1, %r10b +; KNL-NEXT: je LBB23_6 +; KNL-NEXT: ## BB#5: +; KNL-NEXT: movb $-1, %r10b +; KNL-NEXT: LBB23_6: +; KNL-NEXT: movq %r15, %r9 +; KNL-NEXT: shrq $19, %r9 +; KNL-NEXT: andb $1, %r9b +; KNL-NEXT: je LBB23_8 +; KNL-NEXT: ## BB#7: +; KNL-NEXT: movb $-1, %r9b +; KNL-NEXT: LBB23_8: +; KNL-NEXT: movq %r15, %rbx +; KNL-NEXT: shrq $20, %rbx +; KNL-NEXT: andb $1, %bl +; KNL-NEXT: je LBB23_10 +; KNL-NEXT: ## BB#9: +; KNL-NEXT: movb $-1, %bl +; KNL-NEXT: LBB23_10: +; KNL-NEXT: movq %r15, %r12 +; KNL-NEXT: shrq $21, %r12 +; KNL-NEXT: andb $1, %r12b +; KNL-NEXT: je LBB23_12 +; KNL-NEXT: ## BB#11: +; KNL-NEXT: movb $-1, %r12b +; KNL-NEXT: LBB23_12: +; KNL-NEXT: movq %r15, %r14 +; KNL-NEXT: shrq $22, %r14 +; KNL-NEXT: andb $1, %r14b +; KNL-NEXT: je LBB23_14 +; KNL-NEXT: ## BB#13: +; KNL-NEXT: movb $-1, %r14b +; KNL-NEXT: LBB23_14: +; KNL-NEXT: movq %r15, %r8 +; KNL-NEXT: shrq $23, %r8 +; KNL-NEXT: andb $1, %r8b +; KNL-NEXT: je LBB23_16 +; KNL-NEXT: ## BB#15: +; KNL-NEXT: movb $-1, %r8b +; KNL-NEXT: LBB23_16: +; KNL-NEXT: movq %r15, %r13 +; KNL-NEXT: shrq $24, %r13 +; KNL-NEXT: andb $1, %r13b +; KNL-NEXT: je LBB23_18 +; KNL-NEXT: ## BB#17: +; KNL-NEXT: movb $-1, %r13b +; KNL-NEXT: LBB23_18: +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $25, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB23_20 +; KNL-NEXT: ## BB#19: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB23_20: +; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $26, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB23_22 +; KNL-NEXT: ## BB#21: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB23_22: +; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill +; KNL-NEXT: movl $272, %esi ## imm = 0x110 +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $27, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB23_24 +; KNL-NEXT: ## BB#23: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB23_24: +; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill +; KNL-NEXT: movl $273, %eax ## imm = 0x111 +; KNL-NEXT: bextrl %esi, %edi, %esi +; KNL-NEXT: movq %r15, %rcx +; KNL-NEXT: shrq $28, %rcx +; KNL-NEXT: andb $1, %cl +; KNL-NEXT: je LBB23_26 +; KNL-NEXT: ## BB#25: +; KNL-NEXT: movb $-1, %cl +; KNL-NEXT: LBB23_26: +; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill +; KNL-NEXT: bextrl %eax, %edi, %eax +; KNL-NEXT: vmovd %esi, %xmm2 +; KNL-NEXT: movl $274, %esi ## imm = 0x112 +; KNL-NEXT: movq %r15, %rcx +; KNL-NEXT: shrq $29, %rcx +; KNL-NEXT: andb $1, %cl +; KNL-NEXT: je LBB23_28 +; KNL-NEXT: ## BB#27: +; KNL-NEXT: movb $-1, %cl +; KNL-NEXT: LBB23_28: +; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill +; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; KNL-NEXT: bextrl %esi, %edi, %eax +; KNL-NEXT: movzbl %r11b, %esi +; KNL-NEXT: movq %r15, %rcx +; KNL-NEXT: shrq $30, %rcx +; KNL-NEXT: andb $1, %cl +; KNL-NEXT: je LBB23_30 +; KNL-NEXT: ## BB#29: +; KNL-NEXT: movb $-1, %cl +; KNL-NEXT: LBB23_30: +; KNL-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; KNL-NEXT: movl $275, %eax ## imm = 0x113 +; KNL-NEXT: bextrl %eax, %edi, %r11d +; KNL-NEXT: movzbl %dl, %edx +; KNL-NEXT: vmovd %esi, %xmm3 +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $31, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB23_32 +; KNL-NEXT: ## BB#31: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB23_32: +; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill +; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill +; KNL-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2 +; KNL-NEXT: movl $276, %eax ## imm = 0x114 +; KNL-NEXT: bextrl %eax, %edi, %esi +; KNL-NEXT: movl $277, %r11d ## imm = 0x115 +; KNL-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3 +; KNL-NEXT: movzbl %r10b, %r10d +; KNL-NEXT: movb %r15b, %al +; KNL-NEXT: shrb %al +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB23_34 +; KNL-NEXT: ## BB#33: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB23_34: +; KNL-NEXT: vpinsrb $4, %esi, %xmm2, %xmm2 +; KNL-NEXT: bextrl %r11d, %edi, %edx +; KNL-NEXT: movl $278, %r11d ## imm = 0x116 +; KNL-NEXT: vpinsrb $2, %r10d, %xmm3, %xmm3 +; KNL-NEXT: movzbl %r9b, %esi +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: movq %r15, %rcx +; KNL-NEXT: shlq $63, %rcx +; KNL-NEXT: sarq $63, %rcx +; KNL-NEXT: vmovd %ecx, %xmm4 +; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; KNL-NEXT: movb %r15b, %al +; KNL-NEXT: shrb $2, %al +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB23_36 +; KNL-NEXT: ## BB#35: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB23_36: +; KNL-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; KNL-NEXT: bextrl %r11d, %edi, %edx +; KNL-NEXT: movl $279, %r9d ## imm = 0x117 +; KNL-NEXT: vpinsrb $3, %esi, %xmm3, %xmm3 +; KNL-NEXT: movzbl %bl, %ebx +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 +; KNL-NEXT: movb %r15b, %al +; KNL-NEXT: shrb $3, %al +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB23_38 +; KNL-NEXT: ## BB#37: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB23_38: +; KNL-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; KNL-NEXT: bextrl %r9d, %edi, %edx +; KNL-NEXT: movl $280, %esi ## imm = 0x118 +; KNL-NEXT: vpinsrb $4, %ebx, %xmm3, %xmm3 +; KNL-NEXT: movzbl %r12b, %ebx +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 +; KNL-NEXT: movb %r15b, %al +; KNL-NEXT: shrb $4, %al +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB23_40 +; KNL-NEXT: ## BB#39: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB23_40: +; KNL-NEXT: vpinsrb $7, %edx, %xmm2, %xmm2 +; KNL-NEXT: bextrl %esi, %edi, %ecx +; KNL-NEXT: movl $281, %edx ## imm = 0x119 +; KNL-NEXT: vpinsrb $5, %ebx, %xmm3, %xmm3 +; KNL-NEXT: movzbl %r14b, %esi +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 +; KNL-NEXT: movb %r15b, %al +; KNL-NEXT: shrb $5, %al +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB23_42 +; KNL-NEXT: ## BB#41: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB23_42: +; KNL-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 +; KNL-NEXT: bextrl %edx, %edi, %ecx +; KNL-NEXT: movl $282, %edx ## imm = 0x11A +; KNL-NEXT: vpinsrb $6, %esi, %xmm3, %xmm3 +; KNL-NEXT: movzbl %r8b, %esi +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 +; KNL-NEXT: movb %r15b, %bl +; KNL-NEXT: shrb $6, %bl +; KNL-NEXT: andb $1, %bl +; KNL-NEXT: je LBB23_44 +; KNL-NEXT: ## BB#43: +; KNL-NEXT: movb $-1, %bl +; KNL-NEXT: LBB23_44: +; KNL-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; KNL-NEXT: bextrl %edx, %edi, %eax +; KNL-NEXT: movl $283, %ecx ## imm = 0x11B +; KNL-NEXT: vpinsrb $7, %esi, %xmm3, %xmm3 +; KNL-NEXT: movzbl %r13b, %esi +; KNL-NEXT: movzbl %bl, %edx +; KNL-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4 +; KNL-NEXT: movb %r15b, %bl +; KNL-NEXT: shrb $7, %bl +; KNL-NEXT: je LBB23_46 +; KNL-NEXT: ## BB#45: +; KNL-NEXT: movb $-1, %bl +; KNL-NEXT: LBB23_46: +; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; KNL-NEXT: bextrl %ecx, %edi, %ecx +; KNL-NEXT: movl $284, %edx ## imm = 0x11C +; KNL-NEXT: vpinsrb $8, %esi, %xmm3, %xmm3 +; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rax ## 8-byte Reload +; KNL-NEXT: movzbl %al, %esi +; KNL-NEXT: movzbl %bl, %eax +; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $8, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB23_48 +; KNL-NEXT: ## BB#47: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB23_48: +; KNL-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; KNL-NEXT: bextrl %edx, %edi, %ecx +; KNL-NEXT: movl $285, %edx ## imm = 0x11D +; KNL-NEXT: vpinsrb $9, %esi, %xmm3, %xmm3 +; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload +; KNL-NEXT: movzbl %sil, %esi +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $9, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB23_50 +; KNL-NEXT: ## BB#49: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB23_50: +; KNL-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; KNL-NEXT: bextrl %edx, %edi, %ecx +; KNL-NEXT: movl $286, %edx ## imm = 0x11E +; KNL-NEXT: vpinsrb $10, %esi, %xmm3, %xmm3 +; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload +; KNL-NEXT: movzbl %sil, %esi +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $10, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB23_52 +; KNL-NEXT: ## BB#51: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB23_52: +; KNL-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; KNL-NEXT: bextrl %edx, %edi, %edx +; KNL-NEXT: vpinsrb $11, %esi, %xmm3, %xmm3 +; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload +; KNL-NEXT: movzbl %cl, %ecx +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $11, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB23_54 +; KNL-NEXT: ## BB#53: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB23_54: +; KNL-NEXT: vpinsrb $14, %edx, %xmm2, %xmm2 +; KNL-NEXT: shrl $31, %edi +; KNL-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3 +; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload +; KNL-NEXT: movzbl %cl, %ecx +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $12, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB23_56 +; KNL-NEXT: ## BB#55: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB23_56: +; KNL-NEXT: vpinsrb $15, %edi, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3 +; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload +; KNL-NEXT: movzbl %cl, %ecx +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $13, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB23_58 +; KNL-NEXT: ## BB#57: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB23_58: +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: vpinsrb $14, %ecx, %xmm3, %xmm2 +; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload +; KNL-NEXT: movzbl %cl, %ecx +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm3 +; KNL-NEXT: movq %r15, %rax +; KNL-NEXT: shrq $14, %rax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: je LBB23_60 +; KNL-NEXT: ## BB#59: +; KNL-NEXT: movb $-1, %al +; KNL-NEXT: LBB23_60: +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; KNL-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1 +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm2 +; KNL-NEXT: shrq $15, %r15 +; KNL-NEXT: andb $1, %r15b +; KNL-NEXT: je LBB23_62 +; KNL-NEXT: ## BB#61: +; KNL-NEXT: movb $-1, %r15b +; KNL-NEXT: LBB23_62: +; KNL-NEXT: movzbl %r15b, %eax +; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 +; KNL-NEXT: leaq -40(%rbp), %rsp +; KNL-NEXT: popq %rbx +; KNL-NEXT: popq %r12 +; KNL-NEXT: popq %r13 +; KNL-NEXT: popq %r14 +; KNL-NEXT: popq %r15 +; KNL-NEXT: popq %rbp +; KNL-NEXT: retq +; +; SKX-LABEL: test17: +; SKX: ## BB#0: +; SKX-NEXT: kmovq %rdi, %k0 +; SKX-NEXT: cmpl %edx, %esi +; SKX-NEXT: setg %al +; SKX-NEXT: andl $1, %eax +; SKX-NEXT: kmovw %eax, %k1 +; SKX-NEXT: kshiftlq $5, %k1, %k1 +; SKX-NEXT: korq %k1, %k0, %k0 +; SKX-NEXT: vpmovm2b %k0, %zmm0 +; SKX-NEXT: retq %a = bitcast i64 %x to <64 x i1> %b = icmp sgt i32 %y, %z %c = insertelement <64 x i1>%a, i1 %b, i32 5 @@ -350,8 +1334,38 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ret <64 x i8>%d } -; KNL-LABEL: test18 define <8 x i1> @test18(i8 %a, i16 %y) { +; KNL-LABEL: test18: +; KNL: ## BB#0: +; KNL-NEXT: movzbl %dil, %eax +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: kshiftlw $7, %k1, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k1 +; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpmovqw %zmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test18: +; SKX: ## BB#0: +; SKX-NEXT: kmovb %edi, %k0 +; SKX-NEXT: kmovw %esi, %k1 +; SKX-NEXT: kshiftlw $6, %k1, %k2 +; SKX-NEXT: kshiftrw $15, %k2, %k2 +; SKX-NEXT: kshiftlw $7, %k1, %k1 +; SKX-NEXT: kshiftrw $15, %k1, %k1 +; SKX-NEXT: kshiftlb $7, %k1, %k1 +; SKX-NEXT: kshiftlb $6, %k2, %k2 +; SKX-NEXT: korb %k2, %k0, %k0 +; SKX-NEXT: korb %k1, %k0, %k0 +; SKX-NEXT: vpmovm2w %k0, %xmm0 +; SKX-NEXT: retq %b = bitcast i8 %a to <8 x i1> %b1 = bitcast i16 %y to <16 x i1> %el1 = extractelement <16 x i1>%b1, i32 8 @@ -360,31 +1374,76 @@ define <8 x i1> @test18(i8 %a, i16 %y) { %d = insertelement <8 x i1>%c, i1 %el2, i32 6 ret <8 x i1>%d } - -; KNL-LABEL: test21 -; KNL: vpand %ymm -; KNL: vextracti128 $1, %ymm2 -; KNL: vpand %ymm - -; SKX-LABEL: test21 -; SKX: vpmovb2m -; SKX: vmovdqu16 {{.*}}%k1 - define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone { +; KNL-LABEL: test21: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; KNL-NEXT: vpsllw $15, %ymm3, %ymm3 +; KNL-NEXT: vpsraw $15, %ymm3, %ymm3 +; KNL-NEXT: vpand %ymm0, %ymm3, %ymm0 +; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; KNL-NEXT: vpsllw $15, %ymm2, %ymm2 +; KNL-NEXT: vpsraw $15, %ymm2, %ymm2 +; KNL-NEXT: vpand %ymm1, %ymm2, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: test21: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %ymm1, %ymm1 +; SKX-NEXT: vpmovb2m %ymm1, %k1 +; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; SKX-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: retq %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret } -; SKX-LABEL: test22 -; SKX: kmovb define void @test22(<4 x i1> %a, <4 x i1>* %addr) { +; KNL-LABEL: test22: +; KNL: ## BB#0: +; KNL-NEXT: vpextrd $3, %xmm0, %eax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: movb %al, (%rdi) +; KNL-NEXT: vpextrd $2, %xmm0, %eax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: movb %al, (%rdi) +; KNL-NEXT: vpextrd $1, %xmm0, %eax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: movb %al, (%rdi) +; KNL-NEXT: vmovd %xmm0, %eax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: movb %al, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: test22: +; SKX: ## BB#0: +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 +; SKX-NEXT: vpmovd2m %xmm0, %k0 +; SKX-NEXT: kmovb %k0, (%rdi) +; SKX-NEXT: retq store <4 x i1> %a, <4 x i1>* %addr ret void } -; SKX-LABEL: test23 -; SKX: kmovb define void @test23(<2 x i1> %a, <2 x i1>* %addr) { +; KNL-LABEL: test23: +; KNL: ## BB#0: +; KNL-NEXT: vpextrq $1, %xmm0, %rax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: movb %al, (%rdi) +; KNL-NEXT: vmovq %xmm0, %rax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: movb %al, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: test23: +; SKX: ## BB#0: +; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 +; SKX-NEXT: vpmovq2m %xmm0, %k0 +; SKX-NEXT: kmovb %k0, (%rdi) +; SKX-NEXT: retq store <2 x i1> %a, <2 x i1>* %addr ret void } diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll index 4bc1ae13a91..c54010cd91b 100644 --- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll +++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll @@ -1,8 +1,10 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512dq -mattr=+avx512vl| FileCheck %s define <8 x i1> @test(<2 x i1> %a) { ; CHECK-LABEL: test: ; CHECK: # BB#0: +; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 ; CHECK-NEXT: vpmovq2m %xmm0, %k0 ; CHECK-NEXT: kshiftlb $2, %k0, %k0 ; CHECK-NEXT: vpmovm2w %k0, %xmm0 @@ -14,6 +16,7 @@ define <8 x i1> @test(<2 x i1> %a) { define <8 x i1> @test1(<2 x i1> %a) { ; CHECK-LABEL: test1: ; CHECK: # BB#0: +; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 ; CHECK-NEXT: vpmovq2m %xmm0, %k0 ; CHECK-NEXT: kshiftlb $4, %k0, %k0 ; CHECK-NEXT: vpmovm2w %k0, %xmm0 @@ -25,10 +28,12 @@ define <8 x i1> @test1(<2 x i1> %a) { define <8 x i1> @test2(<2 x i1> %a) { ; CHECK-LABEL: test2: ; CHECK: # BB#0: +; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 ; CHECK-NEXT: vpmovq2m %xmm0, %k0 ; CHECK-NEXT: vpmovm2q %k0, %zmm0 ; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; CHECK-NEXT: vpsllq $63, %zmm0, %zmm0 ; CHECK-NEXT: vpmovq2m %zmm0, %k0 ; CHECK-NEXT: vpmovm2w %k0, %xmm0 ; CHECK-NEXT: retq @@ -39,6 +44,7 @@ define <8 x i1> @test2(<2 x i1> %a) { define <8 x i1> @test3(<4 x i1> %a) { ; CHECK-LABEL: test3: ; CHECK: # BB#0: +; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 ; CHECK-NEXT: vpmovd2m %xmm0, %k0 ; CHECK-NEXT: kshiftlb $4, %k0, %k0 ; CHECK-NEXT: kshiftrb $4, %k0, %k0 @@ -52,6 +58,7 @@ define <8 x i1> @test3(<4 x i1> %a) { define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) { ; CHECK-LABEL: test4: ; CHECK: # BB#0: +; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 ; CHECK-NEXT: vpmovd2m %xmm0, %k0 ; CHECK-NEXT: kshiftlb $4, %k0, %k0 ; CHECK-NEXT: kshiftrb $4, %k0, %k1 @@ -66,6 +73,7 @@ define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) { define <4 x i1> @test5(<2 x i1> %a, <2 x i1>%b) { ; CHECK-LABEL: test5: ; CHECK: # BB#0: +; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 ; CHECK-NEXT: vpmovq2m %xmm0, %k0 ; CHECK-NEXT: kshiftlw $2, %k0, %k0 ; CHECK-NEXT: kshiftrw $2, %k0, %k1 @@ -80,6 +88,7 @@ define <4 x i1> @test5(<2 x i1> %a, <2 x i1>%b) { define <16 x i1> @test6(<2 x i1> %a, <2 x i1>%b) { ; CHECK-LABEL: test6: ; CHECK: # BB#0: +; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 ; CHECK-NEXT: vpmovq2m %xmm0, %k0 ; CHECK-NEXT: kshiftlw $2, %k0, %k0 ; CHECK-NEXT: kshiftrw $2, %k0, %k1 @@ -95,6 +104,7 @@ define <16 x i1> @test6(<2 x i1> %a, <2 x i1>%b) { define <32 x i1> @test7(<4 x i1> %a, <4 x i1>%b) { ; CHECK-LABEL: test7: ; CHECK: # BB#0: +; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 ; CHECK-NEXT: vpmovd2m %xmm0, %k0 ; CHECK-NEXT: kshiftlb $4, %k0, %k0 ; CHECK-NEXT: kshiftrb $4, %k0, %k1 @@ -111,7 +121,9 @@ define <32 x i1> @test7(<4 x i1> %a, <4 x i1>%b) { define <64 x i1> @test8(<8 x i1> %a, <8 x i1>%b) { ; CHECK-LABEL: test8: ; CHECK: # BB#0: +; CHECK-NEXT: vpsllw $15, %xmm1, %xmm1 ; CHECK-NEXT: vpmovw2m %xmm1, %k0 +; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 ; CHECK-NEXT: vpmovw2m %xmm0, %k1 ; CHECK-NEXT: kunpckdq %k1, %k0, %k0 ; CHECK-NEXT: vpmovm2b %k0, %zmm0 diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index 769387e80a2..73b5666d443 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -2787,6 +2787,48 @@ define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) { ret i64 %res } +declare i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8>) + +define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) { +; AVX512BW-LABEL: test_int_x86_avx512_cvtb2mask_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpmovb2m %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_cvtb2mask_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: subl $12, %esp +; AVX512F-32-NEXT: .Ltmp9: +; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: kmovq %k0, (%esp) +; AVX512F-32-NEXT: movl (%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: retl + %res = call i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8> %x0) + ret i64 %res +} + +declare i32 @llvm.x86.avx512.cvtw2mask.512(<32 x i16>) + +define i32@test_int_x86_avx512_cvtw2mask_512(<32 x i16> %x0) { +; AVX512BW-LABEL: test_int_x86_avx512_cvtw2mask_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpmovw2m %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_cvtw2mask_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vpmovw2m %zmm0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: retl + %res = call i32 @llvm.x86.avx512.cvtw2mask.512(<32 x i16> %x0) + ret i32 %res +} + declare <64 x i8> @llvm.x86.avx512.cvtmask2b.512(i64) define <64 x i8>@test_int_x86_avx512_cvtmask2b_512(i64 %x0) { diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll index 56d24a3c800..17fbeee0bbe 100644 --- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -4413,6 +4413,54 @@ define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16> ret <32 x i16> %res4 } +declare i16 @llvm.x86.avx512.cvtb2mask.128(<16 x i8>) + +define i16@test_int_x86_avx512_cvtb2mask_128(<16 x i8> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtb2mask_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmovb2m %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq + %res = call i16 @llvm.x86.avx512.cvtb2mask.128(<16 x i8> %x0) + ret i16 %res +} + +declare i32 @llvm.x86.avx512.cvtb2mask.256(<32 x i8>) + +define i32@test_int_x86_avx512_cvtb2mask_256(<32 x i8> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtb2mask_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmovb2m %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512.cvtb2mask.256(<32 x i8> %x0) + ret i32 %res +} + +declare i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16>) + +define i8@test_int_x86_avx512_cvtw2mask_128(<8 x i16> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtw2mask_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmovw2m %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq + %res = call i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16> %x0) + ret i8 %res +} + +declare i16 @llvm.x86.avx512.cvtw2mask.256(<16 x i16>) + +define i16@test_int_x86_avx512_cvtw2mask_256(<16 x i16> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtw2mask_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmovw2m %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq + %res = call i16 @llvm.x86.avx512.cvtw2mask.256(<16 x i16> %x0) + ret i16 %res +} + declare <16 x i8> @llvm.x86.avx512.cvtmask2b.128(i16) define <16 x i8>@test_int_x86_avx512_cvtmask2b_128(i16 %x0) { diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll index e2d2ddcbec8..19cf368cc4d 100644 --- a/test/CodeGen/X86/avx512dq-intrinsics.ll +++ b/test/CodeGen/X86/avx512dq-intrinsics.ll @@ -542,6 +542,30 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x2_512(<4 x i32> %x0, <16 ret <16 x i32> %res4 } +declare i16 @llvm.x86.avx512.cvtd2mask.512(<16 x i32>) + +define i16@test_int_x86_avx512_cvtd2mask_512(<16 x i32> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_512: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmovd2m %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq + %res = call i16 @llvm.x86.avx512.cvtd2mask.512(<16 x i32> %x0) + ret i16 %res +} + +declare i8 @llvm.x86.avx512.cvtq2mask.512(<8 x i64>) + +define i8@test_int_x86_avx512_cvtq2mask_512(<8 x i64> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_512: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmovq2m %zmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq + %res = call i8 @llvm.x86.avx512.cvtq2mask.512(<8 x i64> %x0) + ret i8 %res +} + declare <16 x i32> @llvm.x86.avx512.cvtmask2d.512(i16) define <16 x i32>@test_int_x86_avx512_cvtmask2d_512(i16 %x0) { @@ -565,4 +589,3 @@ define <8 x i64>@test_int_x86_avx512_cvtmask2q_512(i8 %x0) { %res = call <8 x i64> @llvm.x86.avx512.cvtmask2q.512(i8 %x0) ret <8 x i64> %res } - diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/test/CodeGen/X86/avx512dqvl-intrinsics.ll index 1612ea40d5c..a6d517c10cd 100644 --- a/test/CodeGen/X86/avx512dqvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512dqvl-intrinsics.ll @@ -1833,6 +1833,54 @@ define <4 x i32>@test_int_x86_avx512_mask_broadcasti32x2_128(<4 x i32> %x0, <4 x ret <4 x i32> %res4 } +declare i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32>) + +define i8@test_int_x86_avx512_cvtd2mask_128(<4 x i32> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmovd2m %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq + %res = call i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32> %x0) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.cvtd2mask.256(<8 x i32>) + +define i8@test_int_x86_avx512_cvtd2mask_256(<8 x i32> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmovd2m %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq + %res = call i8 @llvm.x86.avx512.cvtd2mask.256(<8 x i32> %x0) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64>) + +define i8@test_int_x86_avx512_cvtq2mask_128(<2 x i64> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmovq2m %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq + %res = call i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64> %x0) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.cvtq2mask.256(<4 x i64>) + +define i8@test_int_x86_avx512_cvtq2mask_256(<4 x i64> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmovq2m %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq + %res = call i8 @llvm.x86.avx512.cvtq2mask.256(<4 x i64> %x0) + ret i8 %res +} + declare <4 x i32> @llvm.x86.avx512.cvtmask2d.128(i8) define <4 x i32>@test_int_x86_avx512_cvtmask2d_128(i8 %x0) { diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll index 8578c76d5f1..b7280d87d3b 100644 --- a/test/CodeGen/X86/masked_gather_scatter.ll +++ b/test/CodeGen/X86/masked_gather_scatter.ll @@ -681,7 +681,7 @@ define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) { ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm0 -; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; KNL_64-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k1 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1} ; KNL_64-NEXT: retq @@ -693,17 +693,27 @@ define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) { ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm0 -; KNL_32-NEXT: vpandq .LCPI14_0, %zmm0, %zmm0 +; KNL_32-NEXT: vpsllvq .LCPI14_0, %zmm0, %zmm0 ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k1 ; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1} ; KNL_32-NEXT: retl ; ; SKX-LABEL: test15: ; SKX: # BB#0: +; SKX-NEXT: vpslld $31, %xmm1, %xmm1 ; SKX-NEXT: vpmovd2m %xmm1, %k1 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1} ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test15: +; SKX_32: # BB#0: +; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1 +; SKX_32-NEXT: vpmovd2m %xmm1, %k1 +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1} +; SKX_32-NEXT: vmovaps %zmm1, %zmm0 +; SKX_32-NEXT: retl %sext_ind = sext <4 x i32> %ind to <4 x i64> %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind @@ -722,7 +732,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 -; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1} ; KNL_64-NEXT: vmovaps %zmm2, %zmm0 @@ -737,7 +747,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x ; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 -; KNL_32-NEXT: vpandq .LCPI15_0, %zmm1, %zmm1 +; KNL_32-NEXT: vpsllvq .LCPI15_0, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1} ; KNL_32-NEXT: vmovaps %zmm2, %zmm0 @@ -745,10 +755,20 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x ; ; SKX-LABEL: test16: ; SKX: # BB#0: +; SKX-NEXT: vpslld $31, %xmm1, %xmm1 ; SKX-NEXT: vpmovd2m %xmm1, %k1 ; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1} ; SKX-NEXT: vmovaps %zmm2, %zmm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test16: +; SKX_32: # BB#0: +; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1 +; SKX_32-NEXT: vpmovd2m %xmm1, %k1 +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1} +; SKX_32-NEXT: vmovaps %zmm2, %zmm0 +; SKX_32-NEXT: retl %sext_ind = sext <4 x i32> %ind to <4 x i64> %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind @@ -762,7 +782,7 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x ; KNL_64: # BB#0: ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 -; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1} ; KNL_64-NEXT: vmovaps %zmm2, %zmm0 @@ -773,7 +793,7 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpandq .LCPI16_0, %zmm1, %zmm1 +; KNL_32-NEXT: vpsllvq .LCPI16_0, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1} ; KNL_32-NEXT: vmovaps %zmm2, %zmm0 @@ -781,10 +801,20 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x ; ; SKX-LABEL: test17: ; SKX: # BB#0: +; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vpmovq2m %xmm1, %k1 ; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1} ; SKX-NEXT: vmovaps %zmm2, %zmm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test17: +; SKX_32: # BB#0: +; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 +; SKX_32-NEXT: vpmovq2m %xmm1, %k1 +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1} +; SKX_32-NEXT: vmovaps %zmm2, %zmm0 +; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind @@ -805,7 +835,7 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) { ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2 -; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} ; KNL_64-NEXT: retq @@ -816,16 +846,24 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) { ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 ; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2 -; KNL_32-NEXT: vpandq .LCPI17_0, %zmm2, %zmm2 +; KNL_32-NEXT: vpsllvq .LCPI17_0, %zmm2, %zmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} ; KNL_32-NEXT: retl ; ; SKX-LABEL: test18: ; SKX: # BB#0: +; SKX-NEXT: vpslld $31, %xmm2, %xmm2 ; SKX-NEXT: vpmovd2m %xmm2, %k1 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} ; SKX-NEXT: retq +; +; SKX_32-LABEL: test18: +; SKX_32: # BB#0: +; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2 +; SKX_32-NEXT: vpmovd2m %xmm2, %k1 +; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1} +; SKX_32-NEXT: retl call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) ret void } @@ -839,7 +877,7 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1} ; KNL_64-NEXT: retq @@ -852,19 +890,21 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpandq .LCPI18_0, %zmm1, %zmm1 +; KNL_32-NEXT: vpsllvq .LCPI18_0, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1} ; KNL_32-NEXT: retl ; ; SKX-LABEL: test19: ; SKX: # BB#0: +; SKX-NEXT: vpslld $31, %xmm1, %xmm1 ; SKX-NEXT: vpmovd2m %xmm1, %k1 ; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1} ; SKX-NEXT: retq ; ; SKX_32-LABEL: test19: ; SKX_32: # BB#0: +; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1} @@ -880,11 +920,11 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) { ; KNL_64-LABEL: test20: ; KNL_64: # BB#0: ; KNL_64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; KNL_64-NEXT: vmovq %xmm2, %xmm2 +; KNL_64-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2 -; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1} ; KNL_64-NEXT: retq @@ -892,24 +932,35 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) { ; KNL_32-LABEL: test20: ; KNL_32: # BB#0: ; KNL_32-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; KNL_32-NEXT: vmovq %xmm2, %xmm2 +; KNL_32-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 ; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2 -; KNL_32-NEXT: vpandq .LCPI19_0, %zmm2, %zmm2 +; KNL_32-NEXT: vpsllvq .LCPI19_0, %zmm2, %zmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1} ; KNL_32-NEXT: retl ; ; SKX-LABEL: test20: ; SKX: # BB#0: +; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX-NEXT: vpmovq2m %xmm2, %k0 ; SKX-NEXT: kshiftlw $2, %k0, %k0 ; SKX-NEXT: kshiftrw $2, %k0, %k1 ; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1} ; SKX-NEXT: retq +; +; SKX_32-LABEL: test20: +; SKX_32: # BB#0: +; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 +; SKX_32-NEXT: vpmovq2m %xmm2, %k0 +; SKX_32-NEXT: kshiftlw $2, %k0, %k0 +; SKX_32-NEXT: kshiftrw $2, %k0, %k1 +; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1} +; SKX_32-NEXT: retl call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask) ret void } @@ -922,7 +973,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) { ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_64-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} ; KNL_64-NEXT: retq @@ -932,19 +983,30 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) { ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_32-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_32-NEXT: vpandq .LCPI20_0, %zmm2, %zmm2 +; KNL_32-NEXT: vpsllvq .LCPI20_0, %zmm2, %zmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} ; KNL_32-NEXT: retl ; ; SKX-LABEL: test21: ; SKX: # BB#0: +; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX-NEXT: vpmovq2m %xmm2, %k0 ; SKX-NEXT: kshiftlw $2, %k0, %k0 ; SKX-NEXT: kshiftrw $2, %k0, %k1 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} ; SKX-NEXT: retq +; +; SKX_32-LABEL: test21: +; SKX_32: # BB#0: +; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 +; SKX_32-NEXT: vpmovq2m %xmm2, %k0 +; SKX_32-NEXT: kshiftlw $2, %k0, %k0 +; SKX_32-NEXT: kshiftrw $2, %k0, %k1 +; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} +; SKX_32-NEXT: retl call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask) ret void } @@ -958,13 +1020,13 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl ; KNL_64-LABEL: test22: ; KNL_64: # BB#0: ; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; KNL_64-NEXT: vmovq %xmm1, %xmm1 +; KNL_64-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1 -; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1} ; KNL_64-NEXT: vmovaps %zmm2, %zmm0 @@ -973,14 +1035,14 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl ; KNL_32-LABEL: test22: ; KNL_32: # BB#0: ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; KNL_32-NEXT: vmovq %xmm1, %xmm1 +; KNL_32-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 -; KNL_32-NEXT: vpandq .LCPI21_0, %zmm1, %zmm1 +; KNL_32-NEXT: vpsllvq .LCPI21_0, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1} ; KNL_32-NEXT: vmovaps %zmm2, %zmm0 @@ -989,12 +1051,25 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl ; SKX-LABEL: test22: ; SKX: # BB#0: ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vpmovq2m %xmm1, %k0 ; SKX-NEXT: kshiftlw $2, %k0, %k0 ; SKX-NEXT: kshiftrw $2, %k0, %k1 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1} ; SKX-NEXT: vmovaps %zmm2, %zmm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test22: +; SKX_32: # BB#0: +; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 +; SKX_32-NEXT: vpmovq2m %xmm1, %k0 +; SKX_32-NEXT: kshiftlw $2, %k0, %k0 +; SKX_32-NEXT: kshiftrw $2, %k0, %k1 +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1} +; SKX_32-NEXT: vmovaps %zmm2, %zmm0 +; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0) @@ -1010,7 +1085,7 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> % ; KNL_64: # BB#0: ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 -; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1} ; KNL_64-NEXT: vmovaps %zmm2, %zmm0 @@ -1021,7 +1096,7 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> % ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpandq .LCPI22_0, %zmm1, %zmm1 +; KNL_32-NEXT: vpsllvq .LCPI22_0, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} ; KNL_32-NEXT: vmovaps %zmm2, %zmm0 @@ -1029,10 +1104,20 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> % ; ; SKX-LABEL: test23: ; SKX: # BB#0: +; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vpmovq2m %xmm1, %k1 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1} ; SKX-NEXT: vmovaps %zmm2, %zmm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test23: +; SKX_32: # BB#0: +; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 +; SKX_32-NEXT: vpmovq2m %xmm1, %k1 +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1} +; SKX_32-NEXT: vmovaps %zmm2, %zmm0 +; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0) @@ -1040,8 +1125,6 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> % } define <2 x i32> @test24(i32* %base, <2 x i32> %ind) { -; -; ; KNL_64-LABEL: test24: ; KNL_64: # BB#0: ; KNL_64-NEXT: movb $3, %al @@ -1056,7 +1139,7 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) { ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; KNL_32-NEXT: vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1 -; KNL_32-NEXT: vpandq .LCPI23_1, %zmm1, %zmm1 +; KNL_32-NEXT: vpsllvq .LCPI23_1, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} ; KNL_32-NEXT: vmovaps %zmm1, %zmm0 @@ -1068,6 +1151,14 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) { ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1} ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test24: +; SKX_32: # BB#0: +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: kxnorw %k0, %k0, %k1 +; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1} +; SKX_32-NEXT: vmovaps %zmm1, %zmm0 +; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> , <2 x i32> undef) @@ -1080,7 +1171,7 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> % ; KNL_64: # BB#0: ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 -; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1} ; KNL_64-NEXT: vmovaps %zmm2, %zmm0 @@ -1091,7 +1182,7 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> % ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpandq .LCPI24_0, %zmm1, %zmm1 +; KNL_32-NEXT: vpsllvq .LCPI24_0, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} ; KNL_32-NEXT: vmovaps %zmm2, %zmm0 @@ -1099,10 +1190,20 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> % ; ; SKX-LABEL: test25: ; SKX: # BB#0: +; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; SKX-NEXT: vpmovq2m %xmm1, %k1 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1} ; SKX-NEXT: vmovaps %zmm2, %zmm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test25: +; SKX_32: # BB#0: +; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 +; SKX_32-NEXT: vpmovq2m %xmm1, %k1 +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1} +; SKX_32-NEXT: vmovaps %zmm2, %zmm0 +; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0) @@ -1125,7 +1226,7 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) { ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; KNL_32-NEXT: vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2 -; KNL_32-NEXT: vpandq .LCPI25_1, %zmm2, %zmm2 +; KNL_32-NEXT: vpsllvq .LCPI25_1, %zmm2, %zmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} ; KNL_32-NEXT: vmovaps %zmm1, %zmm0 @@ -1137,6 +1238,14 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) { ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1} ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test26: +; SKX_32: # BB#0: +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: kxnorw %k0, %k0, %k1 +; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1} +; SKX_32-NEXT: vmovaps %zmm1, %zmm0 +; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> , <2 x i64> %src0) @@ -1198,7 +1307,7 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) { ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; KNL_32-NEXT: vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2 -; KNL_32-NEXT: vpandq .LCPI27_1, %zmm2, %zmm2 +; KNL_32-NEXT: vpsllvq .LCPI27_1, %zmm2, %zmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} ; KNL_32-NEXT: retl @@ -1210,6 +1319,14 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) { ; SKX-NEXT: kmovb %eax, %k1 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} ; SKX-NEXT: retq +; +; SKX_32-LABEL: test28: +; SKX_32: # BB#0: +; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SKX_32-NEXT: movb $3, %al +; SKX_32-NEXT: kmovb %eax, %k1 +; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} +; SKX_32-NEXT: retl call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> ) ret void } @@ -1356,6 +1473,7 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x ; ; SKX-LABEL: test30: ; SKX: # BB#0: +; SKX-NEXT: vpslld $31, %xmm2, %xmm2 ; SKX-NEXT: vpmovd2m %xmm2, %k1 ; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp) ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1 @@ -1389,6 +1507,45 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x ; SKX-NEXT: vmovdqa32 %xmm0, %xmm3 {%k1} ; SKX-NEXT: vmovaps %zmm3, %zmm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test30: +; SKX_32: # BB#0: +; SKX_32-NEXT: subl $12, %esp +; SKX_32-NEXT: .Ltmp0: +; SKX_32-NEXT: .cfi_def_cfa_offset 16 +; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2 +; SKX_32-NEXT: vpmovd2m %xmm2, %k1 +; SKX_32-NEXT: kmovb %k1, {{[0-9]+}}(%esp) +; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1 +; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al +; SKX_32-NEXT: # implicit-def: %XMM1 +; SKX_32-NEXT: andb $1, %al +; SKX_32-NEXT: je .LBB29_2 +; SKX_32-NEXT: # BB#1: # %cond.load +; SKX_32-NEXT: vmovd %xmm2, %eax +; SKX_32-NEXT: vmovd (%eax), %xmm1 +; SKX_32-NEXT: .LBB29_2: # %else +; SKX_32-NEXT: kmovb %k1, {{[0-9]+}}(%esp) +; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al +; SKX_32-NEXT: andb $1, %al +; SKX_32-NEXT: je .LBB29_4 +; SKX_32-NEXT: # BB#3: # %cond.load1 +; SKX_32-NEXT: vpextrd $1, %xmm2, %eax +; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm1, %xmm1 +; SKX_32-NEXT: .LBB29_4: # %else2 +; SKX_32-NEXT: vmovdqa32 {{[0-9]+}}(%esp), %xmm0 +; SKX_32-NEXT: kmovb %k1, (%esp) +; SKX_32-NEXT: movb (%esp), %al +; SKX_32-NEXT: andb $1, %al +; SKX_32-NEXT: je .LBB29_6 +; SKX_32-NEXT: # BB#5: # %cond.load4 +; SKX_32-NEXT: vpextrd $2, %xmm2, %eax +; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm1 +; SKX_32-NEXT: .LBB29_6: # %else5 +; SKX_32-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} +; SKX_32-NEXT: addl $12, %esp +; SKX_32-NEXT: retl %sext_ind = sext <3 x i32> %ind to <3 x i64> %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind @@ -1446,7 +1603,7 @@ define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i ; KNL_64-LABEL: test_gather_16i32: ; KNL_64: # BB#0: ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 -; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2 @@ -1458,7 +1615,7 @@ define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i ; KNL_32-LABEL: test_gather_16i32: ; KNL_32: # BB#0: ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1 +; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} ; KNL_32-NEXT: vmovaps %zmm2, %zmm0 @@ -1467,7 +1624,7 @@ define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i ; SKX-LABEL: test_gather_16i32: ; SKX: # BB#0: ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 -; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; SKX-NEXT: vpslld $31, %zmm2, %zmm2 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 ; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm2 ; SKX-NEXT: kshiftrw $8, %k1, %k2 @@ -1479,7 +1636,7 @@ define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i ; SKX_32-LABEL: test_gather_16i32: ; SKX_32: # BB#0: ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 -; SKX_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1 +; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} ; SKX_32-NEXT: vmovaps %zmm2, %zmm0 @@ -1491,7 +1648,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i ; KNL_64-LABEL: test_gather_16i64: ; KNL_64: # BB#0: ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 -; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1} @@ -1513,7 +1670,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i ; KNL_32-NEXT: andl $-64, %esp ; KNL_32-NEXT: subl $64, %esp ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL_32-NEXT: vpandd .LCPI32_0{1to16}, %zmm1, %zmm1 +; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2 @@ -1528,7 +1685,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i ; SKX-LABEL: test_gather_16i64: ; SKX: # BB#0: ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 -; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; SKX-NEXT: vpslld $31, %zmm2, %zmm2 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1} @@ -1536,6 +1693,31 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i ; SKX-NEXT: vmovaps %zmm3, %zmm0 ; SKX-NEXT: vmovaps %zmm4, %zmm1 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test_gather_16i64: +; SKX_32: # BB#0: +; SKX_32-NEXT: pushl %ebp +; SKX_32-NEXT: .Ltmp1: +; SKX_32-NEXT: .cfi_def_cfa_offset 8 +; SKX_32-NEXT: .Ltmp2: +; SKX_32-NEXT: .cfi_offset %ebp, -8 +; SKX_32-NEXT: movl %esp, %ebp +; SKX_32-NEXT: .Ltmp3: +; SKX_32-NEXT: .cfi_def_cfa_register %ebp +; SKX_32-NEXT: andl $-64, %esp +; SKX_32-NEXT: subl $64, %esp +; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 +; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 +; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 +; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1 +; SKX_32-NEXT: kshiftrw $8, %k1, %k2 +; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1} +; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0 +; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2} +; SKX_32-NEXT: vmovaps %zmm2, %zmm0 +; SKX_32-NEXT: movl %ebp, %esp +; SKX_32-NEXT: popl %ebp +; SKX_32-NEXT: retl %res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0) ret <16 x i64> %res } @@ -1544,7 +1726,7 @@ define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 ; KNL_64-LABEL: test_gather_16f32: ; KNL_64: # BB#0: ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 -; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2 @@ -1556,7 +1738,7 @@ define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 ; KNL_32-LABEL: test_gather_16f32: ; KNL_32: # BB#0: ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL_32-NEXT: vpandd .LCPI33_0{1to16}, %zmm1, %zmm1 +; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1} ; KNL_32-NEXT: vmovaps %zmm2, %zmm0 @@ -1565,7 +1747,7 @@ define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 ; SKX-LABEL: test_gather_16f32: ; SKX: # BB#0: ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 -; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; SKX-NEXT: vpslld $31, %zmm2, %zmm2 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 ; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm2 ; SKX-NEXT: kshiftrw $8, %k1, %k2 @@ -1573,6 +1755,15 @@ define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1} ; SKX-NEXT: vinsertf32x8 $1, %ymm2, %zmm3, %zmm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test_gather_16f32: +; SKX_32: # BB#0: +; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 +; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 +; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 +; SKX_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1} +; SKX_32-NEXT: vmovaps %zmm2, %zmm0 +; SKX_32-NEXT: retl %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0) ret <16 x float> %res } @@ -1580,7 +1771,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, < ; KNL_64-LABEL: test_gather_16f64: ; KNL_64: # BB#0: ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 -; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2 ; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1} @@ -1602,7 +1793,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, < ; KNL_32-NEXT: andl $-64, %esp ; KNL_32-NEXT: subl $64, %esp ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL_32-NEXT: vpandd .LCPI34_0{1to16}, %zmm1, %zmm1 +; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2 @@ -1617,7 +1808,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, < ; SKX-LABEL: test_gather_16f64: ; SKX: # BB#0: ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 -; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; SKX-NEXT: vpslld $31, %zmm2, %zmm2 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1} @@ -1625,6 +1816,31 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, < ; SKX-NEXT: vmovaps %zmm3, %zmm0 ; SKX-NEXT: vmovaps %zmm4, %zmm1 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test_gather_16f64: +; SKX_32: # BB#0: +; SKX_32-NEXT: pushl %ebp +; SKX_32-NEXT: .Ltmp4: +; SKX_32-NEXT: .cfi_def_cfa_offset 8 +; SKX_32-NEXT: .Ltmp5: +; SKX_32-NEXT: .cfi_offset %ebp, -8 +; SKX_32-NEXT: movl %esp, %ebp +; SKX_32-NEXT: .Ltmp6: +; SKX_32-NEXT: .cfi_def_cfa_register %ebp +; SKX_32-NEXT: andl $-64, %esp +; SKX_32-NEXT: subl $64, %esp +; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 +; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 +; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 +; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1 +; SKX_32-NEXT: kshiftrw $8, %k1, %k2 +; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1} +; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0 +; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2} +; SKX_32-NEXT: vmovaps %zmm2, %zmm0 +; SKX_32-NEXT: movl %ebp, %esp +; SKX_32-NEXT: popl %ebp +; SKX_32-NEXT: retl %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0) ret <16 x double> %res } @@ -1633,7 +1849,7 @@ define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> % ; KNL_64-LABEL: test_scatter_16i32: ; KNL_64: # BB#0: ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 -; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2 ; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1} @@ -1644,7 +1860,7 @@ define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> % ; KNL_32-LABEL: test_scatter_16i32: ; KNL_32: # BB#0: ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1 +; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1} ; KNL_32-NEXT: retl @@ -1652,7 +1868,7 @@ define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> % ; SKX-LABEL: test_scatter_16i32: ; SKX: # BB#0: ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 -; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; SKX-NEXT: vpslld $31, %zmm2, %zmm2 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1} @@ -1663,7 +1879,7 @@ define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> % ; SKX_32-LABEL: test_scatter_16i32: ; SKX_32: # BB#0: ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 -; SKX_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1 +; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 ; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1} ; SKX_32-NEXT: retl @@ -1674,7 +1890,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> % ; KNL_64-LABEL: test_scatter_16i64: ; KNL_64: # BB#0: ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 -; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2 ; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1} @@ -1694,7 +1910,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> % ; KNL_32-NEXT: andl $-64, %esp ; KNL_32-NEXT: subl $64, %esp ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL_32-NEXT: vpandd .LCPI36_0{1to16}, %zmm1, %zmm1 +; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2 @@ -1708,12 +1924,36 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> % ; SKX-LABEL: test_scatter_16i64: ; SKX: # BB#0: ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 -; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; SKX-NEXT: vpslld $31, %zmm2, %zmm2 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1} ; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2} ; SKX-NEXT: retq +; +; SKX_32-LABEL: test_scatter_16i64: +; SKX_32: # BB#0: +; SKX_32-NEXT: pushl %ebp +; SKX_32-NEXT: .Ltmp7: +; SKX_32-NEXT: .cfi_def_cfa_offset 8 +; SKX_32-NEXT: .Ltmp8: +; SKX_32-NEXT: .cfi_offset %ebp, -8 +; SKX_32-NEXT: movl %esp, %ebp +; SKX_32-NEXT: .Ltmp9: +; SKX_32-NEXT: .cfi_def_cfa_register %ebp +; SKX_32-NEXT: andl $-64, %esp +; SKX_32-NEXT: subl $64, %esp +; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 +; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 +; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 +; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1 +; SKX_32-NEXT: kshiftrw $8, %k1, %k2 +; SKX_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1} +; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0 +; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2} +; SKX_32-NEXT: movl %ebp, %esp +; SKX_32-NEXT: popl %ebp +; SKX_32-NEXT: retl call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask) ret void } @@ -1722,7 +1962,7 @@ define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x floa ; KNL_64-LABEL: test_scatter_16f32: ; KNL_64: # BB#0: ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 -; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2 ; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1} @@ -1733,7 +1973,7 @@ define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x floa ; KNL_32-LABEL: test_scatter_16f32: ; KNL_32: # BB#0: ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL_32-NEXT: vpandd .LCPI37_0{1to16}, %zmm1, %zmm1 +; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1} ; KNL_32-NEXT: retl @@ -1741,13 +1981,21 @@ define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x floa ; SKX-LABEL: test_scatter_16f32: ; SKX: # BB#0: ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 -; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; SKX-NEXT: vpslld $31, %zmm2, %zmm2 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1} ; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0 ; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2} ; SKX-NEXT: retq +; +; SKX_32-LABEL: test_scatter_16f32: +; SKX_32: # BB#0: +; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 +; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 +; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 +; SKX_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1} +; SKX_32-NEXT: retl call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask) ret void } @@ -1756,7 +2004,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou ; KNL_64-LABEL: test_scatter_16f64: ; KNL_64: # BB#0: ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 -; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2 ; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1} @@ -1776,7 +2024,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou ; KNL_32-NEXT: andl $-64, %esp ; KNL_32-NEXT: subl $64, %esp ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL_32-NEXT: vpandd .LCPI38_0{1to16}, %zmm1, %zmm1 +; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2 @@ -1790,12 +2038,36 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou ; SKX-LABEL: test_scatter_16f64: ; SKX: # BB#0: ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 -; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; SKX-NEXT: vpslld $31, %zmm2, %zmm2 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1} ; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2} ; SKX-NEXT: retq +; +; SKX_32-LABEL: test_scatter_16f64: +; SKX_32: # BB#0: +; SKX_32-NEXT: pushl %ebp +; SKX_32-NEXT: .Ltmp10: +; SKX_32-NEXT: .cfi_def_cfa_offset 8 +; SKX_32-NEXT: .Ltmp11: +; SKX_32-NEXT: .cfi_offset %ebp, -8 +; SKX_32-NEXT: movl %esp, %ebp +; SKX_32-NEXT: .Ltmp12: +; SKX_32-NEXT: .cfi_def_cfa_register %ebp +; SKX_32-NEXT: andl $-64, %esp +; SKX_32-NEXT: subl $64, %esp +; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 +; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 +; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 +; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1 +; SKX_32-NEXT: kshiftrw $8, %k1, %k2 +; SKX_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1} +; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0 +; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2} +; SKX_32-NEXT: movl %ebp, %esp +; SKX_32-NEXT: popl %ebp +; SKX_32-NEXT: retl call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask) ret void } diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll index 0662d1b22ed..c29933e266b 100644 --- a/test/CodeGen/X86/masked_memop.ll +++ b/test/CodeGen/X86/masked_memop.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s --check-prefix=AVX512 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s --check-prefix=AVX2 ; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s --check-prefix=AVX_SCALAR @@ -363,19 +364,99 @@ define <16 x i32*> @test23(<16 x i32*> %trigger, <16 x i32*>* %addr) { declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>*, i32, <16 x i1>, <16 x %mystruct*>) -; AVX512-LABEL: test24 -; AVX512: vmovdqu64 (%rdi), %zmm0 {%k1} {z} -; AVX512: kshiftrw $8, %k1, %k1 -; AVX512: vmovdqu64 64(%rdi), %zmm1 {%k1} {z} - define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) { +; AVX512-LABEL: test24: +; AVX512: ## BB#0: +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} +; AVX512-NEXT: kshiftrw $8, %k1, %k1 +; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} {z} +; AVX512-NEXT: retq +; +; AVX2-LABEL: test24: +; AVX2: ## BB#0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vpmaskmovq (%rdi), %ymm1, %ymm4 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vpmaskmovq 96(%rdi), %ymm1, %ymm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vpmaskmovq 64(%rdi), %ymm1, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm0, %ymm1 +; AVX2-NEXT: vmovdqa %ymm4, %ymm0 +; AVX2-NEXT: retq +; +; SKX-LABEL: test24: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 +; SKX-NEXT: vpmovb2m %xmm0, %k1 +; SKX-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} +; SKX-NEXT: kshiftrw $8, %k1, %k1 +; SKX-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} {z} +; SKX-NEXT: retq %res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1>%mask, <16 x %mystruct*>zeroinitializer) ret <16 x %mystruct*> %res } define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) { +; AVX512-LABEL: test_store_16i64: +; AVX512: ## BB#0: +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1} +; AVX512-NEXT: kshiftrw $8, %k1, %k1 +; AVX512-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1} +; AVX512-NEXT: retq +; +; AVX2-LABEL: test_store_16i64: +; AVX2: ## BB#0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm5, %xmm5 +; AVX2-NEXT: vpsrad $31, %xmm5, %xmm5 +; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5 +; AVX2-NEXT: vpmaskmovq %ymm1, %ymm5, (%rdi) +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vpmaskmovq %ymm4, %ymm1, 96(%rdi) +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vpmaskmovq %ymm3, %ymm1, 64(%rdi) +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: vpmaskmovq %ymm2, %ymm0, 32(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; ; SKX-LABEL: test_store_16i64: ; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 ; SKX-NEXT: vpmovb2m %xmm0, %k1 ; SKX-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1} ; SKX-NEXT: kshiftrw $8, %k1, %k1 @@ -386,8 +467,47 @@ define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %sr } declare void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask) define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) { +; AVX512-LABEL: test_store_16f64: +; AVX512: ## BB#0: +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vmovupd %zmm1, (%rdi) {%k1} +; AVX512-NEXT: kshiftrw $8, %k1, %k1 +; AVX512-NEXT: vmovupd %zmm2, 64(%rdi) {%k1} +; AVX512-NEXT: retq +; +; AVX2-LABEL: test_store_16f64: +; AVX2: ## BB#0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm5, %xmm5 +; AVX2-NEXT: vpsrad $31, %xmm5, %xmm5 +; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5 +; AVX2-NEXT: vmaskmovpd %ymm1, %ymm5, (%rdi) +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vmaskmovpd %ymm4, %ymm1, 96(%rdi) +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vmaskmovpd %ymm3, %ymm1, 64(%rdi) +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: vmaskmovpd %ymm2, %ymm0, 32(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; ; SKX-LABEL: test_store_16f64: ; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 ; SKX-NEXT: vpmovb2m %xmm0, %k1 ; SKX-NEXT: vmovupd %zmm1, (%rdi) {%k1} ; SKX-NEXT: kshiftrw $8, %k1, %k1 @@ -398,8 +518,53 @@ define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x doubl } declare void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask) define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) { +; AVX512-LABEL: test_load_16i64: +; AVX512: ## BB#0: +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1} +; AVX512-NEXT: kshiftrw $8, %k1, %k1 +; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1} +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: vmovaps %zmm2, %zmm1 +; AVX512-NEXT: retq +; +; AVX2-LABEL: test_load_16i64: +; AVX2: ## BB#0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm5, %xmm5 +; AVX2-NEXT: vpsrad $31, %xmm5, %xmm5 +; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5 +; AVX2-NEXT: vpmaskmovq (%rdi), %ymm5, %ymm9 +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm7, %xmm7 +; AVX2-NEXT: vpsrad $31, %xmm7, %xmm7 +; AVX2-NEXT: vpmovsxdq %xmm7, %ymm7 +; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm7, %ymm8 +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm6, %xmm6 +; AVX2-NEXT: vpsrad $31, %xmm6, %xmm6 +; AVX2-NEXT: vpmovsxdq %xmm6, %ymm6 +; AVX2-NEXT: vpmaskmovq 64(%rdi), %ymm6, %ymm10 +; AVX2-NEXT: vblendvpd %ymm5, %ymm9, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm7, %ymm8, %ymm2, %ymm1 +; AVX2-NEXT: vblendvpd %ymm6, %ymm10, %ymm3, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: vpmaskmovq 96(%rdi), %ymm0, %ymm3 +; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vmovapd %ymm5, %ymm0 +; AVX2-NEXT: retq +; ; SKX-LABEL: test_load_16i64: ; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 ; SKX-NEXT: vpmovb2m %xmm0, %k1 ; SKX-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1} ; SKX-NEXT: kshiftrw $8, %k1, %k1 @@ -412,8 +577,53 @@ define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64 } declare <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0) define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) { +; AVX512-LABEL: test_load_16f64: +; AVX512: ## BB#0: +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vmovupd (%rdi), %zmm1 {%k1} +; AVX512-NEXT: kshiftrw $8, %k1, %k1 +; AVX512-NEXT: vmovupd 64(%rdi), %zmm2 {%k1} +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: vmovaps %zmm2, %zmm1 +; AVX512-NEXT: retq +; +; AVX2-LABEL: test_load_16f64: +; AVX2: ## BB#0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm5, %xmm5 +; AVX2-NEXT: vpsrad $31, %xmm5, %xmm5 +; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5 +; AVX2-NEXT: vmaskmovpd (%rdi), %ymm5, %ymm9 +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm7, %xmm7 +; AVX2-NEXT: vpsrad $31, %xmm7, %xmm7 +; AVX2-NEXT: vpmovsxdq %xmm7, %ymm7 +; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm7, %ymm8 +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm6, %xmm6 +; AVX2-NEXT: vpsrad $31, %xmm6, %xmm6 +; AVX2-NEXT: vpmovsxdq %xmm6, %ymm6 +; AVX2-NEXT: vmaskmovpd 64(%rdi), %ymm6, %ymm10 +; AVX2-NEXT: vblendvpd %ymm5, %ymm9, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm7, %ymm8, %ymm2, %ymm1 +; AVX2-NEXT: vblendvpd %ymm6, %ymm10, %ymm3, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: vmaskmovpd 96(%rdi), %ymm0, %ymm3 +; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vmovapd %ymm5, %ymm0 +; AVX2-NEXT: retq +; ; SKX-LABEL: test_load_16f64: ; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 ; SKX-NEXT: vpmovb2m %xmm0, %k1 ; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1} ; SKX-NEXT: kshiftrw $8, %k1, %k1 @@ -427,8 +637,112 @@ define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 declare <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0) define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) { +; AVX512-LABEL: test_load_32f64: +; AVX512: ## BB#0: +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512-NEXT: vpmovsxbd %xmm5, %zmm5 +; AVX512-NEXT: vpslld $31, %zmm5, %zmm5 +; AVX512-NEXT: vptestmd %zmm5, %zmm5, %k1 +; AVX512-NEXT: vmovupd 128(%rdi), %zmm3 {%k1} +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k2 +; AVX512-NEXT: vmovupd (%rdi), %zmm1 {%k2} +; AVX512-NEXT: kshiftrw $8, %k1, %k1 +; AVX512-NEXT: vmovupd 192(%rdi), %zmm4 {%k1} +; AVX512-NEXT: kshiftrw $8, %k2, %k1 +; AVX512-NEXT: vmovupd 64(%rdi), %zmm2 {%k1} +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: vmovaps %zmm2, %zmm1 +; AVX512-NEXT: vmovaps %zmm3, %zmm2 +; AVX512-NEXT: vmovaps %zmm4, %zmm3 +; AVX512-NEXT: retq +; +; AVX2-LABEL: test_load_32f64: +; AVX2: ## BB#0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: Ltmp0: +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: Ltmp1: +; AVX2-NEXT: .cfi_offset %rbp, -16 +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: Ltmp2: +; AVX2-NEXT: .cfi_def_cfa_register %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $32, %rsp +; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm8, %xmm8 +; AVX2-NEXT: vpsrad $31, %xmm8, %xmm8 +; AVX2-NEXT: vpmovsxdq %xmm8, %ymm8 +; AVX2-NEXT: vmaskmovpd 32(%rsi), %ymm8, %ymm9 +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm10, %xmm10 +; AVX2-NEXT: vpsrad $31, %xmm10, %xmm10 +; AVX2-NEXT: vpmovsxdq %xmm10, %ymm10 +; AVX2-NEXT: vmaskmovpd 64(%rsi), %ymm10, %ymm11 +; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[3,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm12, %xmm12 +; AVX2-NEXT: vpsrad $31, %xmm12, %xmm12 +; AVX2-NEXT: vpmovsxdq %xmm12, %ymm12 +; AVX2-NEXT: vmaskmovpd 96(%rsi), %ymm12, %ymm13 +; AVX2-NEXT: vblendvpd %ymm8, %ymm9, %ymm2, %ymm8 +; AVX2-NEXT: vblendvpd %ymm10, %ymm11, %ymm3, %ymm9 +; AVX2-NEXT: vblendvpd %ymm12, %ymm13, %ymm4, %ymm11 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX2-NEXT: vpsrad $31, %xmm3, %xmm3 +; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3 +; AVX2-NEXT: vmaskmovpd 160(%rsi), %ymm3, %ymm10 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX2-NEXT: vpsrad $31, %xmm4, %xmm4 +; AVX2-NEXT: vpmovsxdq %xmm4, %ymm4 +; AVX2-NEXT: vmaskmovpd 192(%rsi), %ymm4, %ymm12 +; AVX2-NEXT: vblendvpd %ymm3, %ymm10, %ymm6, %ymm3 +; AVX2-NEXT: vmovapd 16(%rbp), %ymm6 +; AVX2-NEXT: vblendvpd %ymm4, %ymm12, %ymm7, %ymm4 +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm7, %xmm7 +; AVX2-NEXT: vpsrad $31, %xmm7, %xmm7 +; AVX2-NEXT: vpmovsxdq %xmm7, %ymm7 +; AVX2-NEXT: vmaskmovpd 224(%rsi), %ymm7, %ymm10 +; AVX2-NEXT: vblendvpd %ymm7, %ymm10, %ymm6, %ymm6 +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: vmaskmovpd (%rsi), %ymm0, %ymm7 +; AVX2-NEXT: vblendvpd %ymm0, %ymm7, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vmaskmovpd 128(%rsi), %ymm1, %ymm2 +; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm5, %ymm1 +; AVX2-NEXT: vmovapd %ymm1, 128(%rdi) +; AVX2-NEXT: vmovapd %ymm0, (%rdi) +; AVX2-NEXT: vmovapd %ymm6, 224(%rdi) +; AVX2-NEXT: vmovapd %ymm4, 192(%rdi) +; AVX2-NEXT: vmovapd %ymm3, 160(%rdi) +; AVX2-NEXT: vmovapd %ymm11, 96(%rdi) +; AVX2-NEXT: vmovapd %ymm9, 64(%rdi) +; AVX2-NEXT: vmovapd %ymm8, 32(%rdi) +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; ; SKX-LABEL: test_load_32f64: ; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 ; SKX-NEXT: vpmovb2m %ymm0, %k1 ; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1} ; SKX-NEXT: kshiftrd $16, %k1, %k2 diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll index cfced3c36de..631968f6afa 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -2094,7 +2094,7 @@ define <8 x double> @test_vshuff64x2_512_maskz(<8 x double> %x, <8 x double> %x1 ; AVX512F-LABEL: test_vshuff64x2_512_maskz: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm2 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1] ; AVX512F-NEXT: retq @@ -2102,7 +2102,7 @@ define <8 x double> @test_vshuff64x2_512_maskz(<8 x double> %x, <8 x double> %x1 ; AVX512F-32-LABEL: test_vshuff64x2_512_maskz: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-32-NEXT: vpandq .LCPI122_0, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpsllvq .LCPI122_0, %zmm2, %zmm2 ; AVX512F-32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1] ; AVX512F-32-NEXT: retl @@ -2115,7 +2115,7 @@ define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1> ; AVX512F-LABEL: test_vshufi64x2_512_mask: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm2 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1] ; AVX512F-NEXT: retq @@ -2123,7 +2123,7 @@ define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1> ; AVX512F-32-LABEL: test_vshufi64x2_512_mask: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-32-NEXT: vpandq .LCPI123_0, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpsllvq .LCPI123_0, %zmm2, %zmm2 ; AVX512F-32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1] ; AVX512F-32-NEXT: retl @@ -2152,7 +2152,7 @@ define <8 x double> @test_vshuff64x2_512_mem_mask(<8 x double> %x, <8 x double> ; AVX512F-LABEL: test_vshuff64x2_512_mem_mask: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 ; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1] ; AVX512F-NEXT: retq @@ -2160,7 +2160,7 @@ define <8 x double> @test_vshuff64x2_512_mem_mask(<8 x double> %x, <8 x double> ; AVX512F-32-LABEL: test_vshuff64x2_512_mem_mask: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vpmovsxwq %xmm1, %zmm1 -; AVX512F-32-NEXT: vpandq .LCPI125_0, %zmm1, %zmm1 +; AVX512F-32-NEXT: vpsllvq .LCPI125_0, %zmm1, %zmm1 ; AVX512F-32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1] @@ -2175,7 +2175,7 @@ define <8 x double> @test_vshuff64x2_512_mem_maskz(<8 x double> %x, <8 x double> ; AVX512F-LABEL: test_vshuff64x2_512_mem_maskz: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 ; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1] ; AVX512F-NEXT: retq @@ -2183,7 +2183,7 @@ define <8 x double> @test_vshuff64x2_512_mem_maskz(<8 x double> %x, <8 x double> ; AVX512F-32-LABEL: test_vshuff64x2_512_mem_maskz: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vpmovsxwq %xmm1, %zmm1 -; AVX512F-32-NEXT: vpandq .LCPI126_0, %zmm1, %zmm1 +; AVX512F-32-NEXT: vpsllvq .LCPI126_0, %zmm1, %zmm1 ; AVX512F-32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1] diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll index b9c29dee3be..a387f894a06 100644 --- a/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/test/CodeGen/X86/vector-shuffle-v1.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq| FileCheck %s --check-prefix=VL_BW_DQ @@ -11,9 +12,11 @@ define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) { ; ; VL_BW_DQ-LABEL: shuf2i1_1_0: ; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 ; VL_BW_DQ-NEXT: retq @@ -31,12 +34,14 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; ; VL_BW_DQ-LABEL: shuf2i1_1_2: ; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 ; VL_BW_DQ-NEXT: movb $1, %al ; VL_BW_DQ-NEXT: kmovb %eax, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm1 ; VL_BW_DQ-NEXT: vpalignr $8, %xmm0, %xmm1, %xmm0 +; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 ; VL_BW_DQ-NEXT: retq @@ -53,9 +58,11 @@ define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) { ; ; VL_BW_DQ-LABEL: shuf4i1_3_2_10: ; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0 ; VL_BW_DQ-NEXT: retq @@ -71,7 +78,7 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> % ; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0] ; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 ; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 @@ -83,6 +90,7 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> % ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] ; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0 ; VL_BW_DQ-NEXT: retq @@ -102,7 +110,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1 ; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm2 {%k1} {z} ; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm2, %zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 @@ -116,7 +124,8 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm1 ; VL_BW_DQ-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; VL_BW_DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 -; VL_BW_DQ-NEXT: vpmovd2m %zmm1, %k0 +; VL_BW_DQ-NEXT: vpslld $31, %zmm1, %zmm0 +; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0 ; VL_BW_DQ-NEXT: retq %a2 = icmp eq <16 x i32> %a, %a1 @@ -137,10 +146,12 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: ; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: vpsllw $7, %ymm0, %ymm0 ; VL_BW_DQ-NEXT: vpmovb2m %ymm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0 ; VL_BW_DQ-NEXT: vmovdqu16 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; VL_BW_DQ-NEXT: vpsllw $15, %zmm0, %zmm0 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %ymm0 ; VL_BW_DQ-NEXT: retq @@ -157,7 +168,7 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) { ; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z} ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2 ; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 ; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 @@ -169,6 +180,7 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) { ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 ; VL_BW_DQ-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 ; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0 ; VL_BW_DQ-NEXT: retq @@ -186,7 +198,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u> ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: retq @@ -198,6 +210,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; VL_BW_DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u> ; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax ; VL_BW_DQ-NEXT: retq @@ -214,7 +227,7 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) { ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} ; AVX512F-NEXT: vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1] -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: retq @@ -224,6 +237,7 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) { ; VL_BW_DQ-NEXT: kmovb %edi, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 ; VL_BW_DQ-NEXT: vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1] +; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax ; VL_BW_DQ-NEXT: retq @@ -242,7 +256,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { ; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0] ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: retq @@ -254,6 +268,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { ; VL_BW_DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0] ; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax ; VL_BW_DQ-NEXT: retq @@ -272,7 +287,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) { ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7] ; AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: retq @@ -284,7 +299,8 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) { ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7] ; VL_BW_DQ-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0 +; VL_BW_DQ-NEXT: vpsllq $63, %zmm2, %zmm0 +; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> @@ -306,7 +322,7 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) { ; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1] ; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: retq @@ -320,6 +336,7 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) { ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm1 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1] ; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax ; VL_BW_DQ-NEXT: retq @@ -333,26 +350,27 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 -; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,1,2,3,4,5,6,7] -; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vpandq %zmm3, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7] +; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones: ; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: vpsllw $15, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7] ; VL_BW_DQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 ; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0 +; VL_BW_DQ-NEXT: vpsllq $63, %zmm2, %zmm0 +; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax ; VL_BW_DQ-NEXT: retq %c = shufflevector <8 x i1> , <8 x i1> %a, <8 x i32> @@ -367,7 +385,7 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) { ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0 -; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: retq @@ -377,6 +395,7 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) { ; VL_BW_DQ-NEXT: kmovw %edi, %k0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm0 ; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0 +; VL_BW_DQ-NEXT: vpslld $31, %zmm0, %zmm0 ; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovw %k0, %eax ; VL_BW_DQ-NEXT: retq @@ -387,6 +406,27 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) { } define i64 @shuf64i1_zero(i64 %a) { +; AVX512F-LABEL: shuf64i1_zero: +; AVX512F: # BB#0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: .Ltmp0: +; AVX512F-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-NEXT: .Ltmp1: +; AVX512F-NEXT: .cfi_offset %rbp, -16 +; AVX512F-NEXT: movq %rsp, %rbp +; AVX512F-NEXT: .Ltmp2: +; AVX512F-NEXT: .cfi_def_cfa_register %rbp +; AVX512F-NEXT: andq $-32, %rsp +; AVX512F-NEXT: subq $32, %rsp +; AVX512F-NEXT: movb $0, (%rsp) +; AVX512F-NEXT: movl (%rsp), %ecx +; AVX512F-NEXT: movq %rcx, %rax +; AVX512F-NEXT: shlq $32, %rax +; AVX512F-NEXT: orq %rcx, %rax +; AVX512F-NEXT: movq %rbp, %rsp +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: retq +; ; VL_BW_DQ-LABEL: shuf64i1_zero: ; VL_BW_DQ: # BB#0: ; VL_BW_DQ-NEXT: kxorq %k0, %k0, %k0