llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_cvtsi2ss32 : GCCBuiltin<"__builtin_ia32_cvtsi2ss32">,
- Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
- llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
+ llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_cvtsi2ss64 : GCCBuiltin<"__builtin_ia32_cvtsi2ss64">,
- Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
- llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>;
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
+ llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_cvtsi2sd32 : GCCBuiltin<"__builtin_ia32_cvtsi2sd32">,
- Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
- llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
+ llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_cvtsi2sd64 : GCCBuiltin<"__builtin_ia32_cvtsi2sd64">,
- Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
- llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>;
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
+ llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+ def int_x86_avx512_cvtb2mask_128 : GCCBuiltin<"__builtin_ia32_cvtb2mask128">,
+ Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_cvtb2mask_256 : GCCBuiltin<"__builtin_ia32_cvtb2mask256">,
+ Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_cvtb2mask_512 : GCCBuiltin<"__builtin_ia32_cvtb2mask512">,
+ Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty], [IntrNoMem]>;
+
+ def int_x86_avx512_cvtw2mask_128 : GCCBuiltin<"__builtin_ia32_cvtw2mask128">,
+ Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty], [IntrNoMem]>;
+ def int_x86_avx512_cvtw2mask_256 : GCCBuiltin<"__builtin_ia32_cvtw2mask256">,
+ Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty], [IntrNoMem]>;
+ def int_x86_avx512_cvtw2mask_512 : GCCBuiltin<"__builtin_ia32_cvtw2mask512">,
+ Intrinsic<[llvm_i32_ty], [llvm_v32i16_ty], [IntrNoMem]>;
+
+ def int_x86_avx512_cvtd2mask_128 : GCCBuiltin<"__builtin_ia32_cvtd2mask128">,
+ Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_cvtd2mask_256 : GCCBuiltin<"__builtin_ia32_cvtd2mask256">,
+ Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_cvtd2mask_512 : GCCBuiltin<"__builtin_ia32_cvtd2mask512">,
+ Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty], [IntrNoMem]>;
+
+ def int_x86_avx512_cvtq2mask_128 : GCCBuiltin<"__builtin_ia32_cvtq2mask128">,
+ Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_cvtq2mask_256 : GCCBuiltin<"__builtin_ia32_cvtq2mask256">,
+ Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_cvtq2mask_512 : GCCBuiltin<"__builtin_ia32_cvtq2mask512">,
+ Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty], [IntrNoMem]>;
def int_x86_avx512_cvtmask2b_128 : GCCBuiltin<"__builtin_ia32_cvtmask2b128">,
Intrinsic<[llvm_v16i8_ty], [llvm_i16_ty], [IntrNoMem]>;
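For orientation (an editorial aside, not part of the patch): each cvt*2mask
intrinsic packs one bit per source lane, taken from the lane's MSB, so the
scalar return width always equals the lane count (16 bytes -> i16, 32 bytes
-> i32, and so on). A minimal C++ sketch of that semantics for the 128-bit
byte case; the helper name is illustrative only:

#include <cassert>
#include <cstdint>

// Scalar model of VPMOVB2M on a 128-bit source: one mask bit per byte lane,
// taken from the lane's most significant bit. The wider variants differ only
// in lane count, which is why cvtb2mask_128/256/512 return i16/i32/i64.
static uint16_t cvtb2mask128(const uint8_t Bytes[16]) {
  uint16_t Mask = 0;
  for (int i = 0; i < 16; ++i)
    Mask |= static_cast<uint16_t>(Bytes[i] >> 7) << i; // MSB -> mask bit i
  return Mask;
}

int main() {
  uint8_t Bytes[16] = {0};
  Bytes[0] = 0x80; // MSB set   -> mask bit 0
  Bytes[5] = 0xFF; // MSB set   -> mask bit 5
  Bytes[7] = 0x7F; // MSB clear -> mask bit 7 stays 0
  assert(cvtb2mask128(Bytes) == 0x0021);
  return 0;
}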
return SDValue();
}
+static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+
+ assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
+
+ // Shift the LSB to the MSB and use the SKX VPMOV*2M instructions.
+ unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
+ if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 &&
+ Subtarget->hasBWI()) || // legal, will go to VPMOVB2M, VPMOVW2M
+ ((InVT.is256BitVector() || InVT.is128BitVector()) &&
+ InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI() &&
+ Subtarget->hasVLX())) { // legal, will go to VPMOVB2M, VPMOVW2M
+ // Shifting packed bytes is not supported natively; bitcast to packed words.
+ MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
+ SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
+ DAG.getBitcast(ExtVT, In),
+ DAG.getConstant(ShiftInx, DL, ExtVT));
+ ShiftNode = DAG.getBitcast(InVT, ShiftNode);
+ return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
+ }
+ if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 &&
+ Subtarget->hasDQI()) || // legal, will go to VPMOVD2M, VPMOVQ2M
+ ((InVT.is256BitVector() || InVT.is128BitVector()) &&
+ InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI() &&
+ Subtarget->hasVLX())) { // legal, will go to VPMOVD2M, VPMOVQ2M
+
+ SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
+ DAG.getConstant(ShiftInx, DL, InVT));
+ return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
+ }
+
+ // Shift LSB to MSB, extend if necessary and use TESTM.
+ unsigned NumElts = InVT.getVectorNumElements();
+ if (InVT.getSizeInBits() < 512 &&
+ (InVT.getScalarType() == MVT::i8 || InVT.getScalarType() == MVT::i16 ||
+ !Subtarget->hasVLX())) {
+ assert((NumElts == 8 || NumElts == 16) && "Unexpected vector type.");
+
+ // TESTD/Q should be used here (with BWI we take the CVT2MASK path above),
+ // so the vector should be extended to packed dwords/qwords.
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+ In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
+ InVT = ExtVT;
+ ShiftInx = InVT.getScalarSizeInBits() - 1;
+ }
+
+ SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
+ DAG.getConstant(ShiftInx, DL, InVT));
+ return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
+}
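An editorial aside on the shift trick above, with a scalar C++ sketch (not
part of the patch): truncating a lane to i1 keeps only bit 0, while VPMOV*2M
reads the sign bit and VPTESTM tests the lane for nonzero, so the lowering
first shifts bit 0 into the MSB; after the shift the two views agree.

#include <cassert>
#include <cstdint>

// Scalar analogue for one 16-bit lane: shift bit 0 into the sign bit
// (VPSLLW $15). The lane is then either 0x8000 or 0, so reading the MSB
// (VPMOVW2M) and testing for nonzero (VPTESTM) yield the same i1 bit.
static bool truncLaneToI1(uint16_t Lane) {
  uint16_t Shifted = static_cast<uint16_t>(Lane << 15); // LSB -> MSB
  return (Shifted & 0x8000u) != 0;                      // recover the i1
}

int main() {
  assert(truncLaneToI1(0x0001));  // odd lane  -> true
  assert(!truncLaneToI1(0xFFFE)); // even lane -> false; high bits ignored
  return 0;
}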
+
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Invalid TRUNCATE operation");
- // move vector to mask - truncate solution for SKX
- if (VT.getVectorElementType() == MVT::i1) {
- if (InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 &&
- Subtarget->hasBWI())
- return Op; // legal, will go to VPMOVB2M, VPMOVW2M
- if ((InVT.is256BitVector() || InVT.is128BitVector())
- && InVT.getScalarSizeInBits() <= 16 &&
- Subtarget->hasBWI() && Subtarget->hasVLX())
- return Op; // legal, will go to VPMOVB2M, VPMOVW2M
- if (InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 &&
- Subtarget->hasDQI())
- return Op; // legal, will go to VPMOVD2M, VPMOVQ2M
- if ((InVT.is256BitVector() || InVT.is128BitVector())
- && InVT.getScalarSizeInBits() >= 32 &&
- Subtarget->hasDQI() && Subtarget->hasVLX())
- return Op; // legal, will go to VPMOVB2M, VPMOVQ2M
- }
-
- if (VT.getVectorElementType() == MVT::i1) {
- assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
- unsigned NumElts = InVT.getVectorNumElements();
- assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
- if (InVT.getSizeInBits() < 512) {
- MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
- In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
- InVT = ExtVT;
- }
-
- SDValue OneV =
- DAG.getConstant(APInt::getSignBit(InVT.getScalarSizeInBits()), DL, InVT);
- SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
- return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
- }
+ if (VT.getVectorElementType() == MVT::i1)
+ return LowerTruncateVecI1(Op, DAG, Subtarget);
// vpmovqb/w/d, vpmovdb/w, vpmovwb
if (Subtarget->hasAVX512()) {
}
case BROADCASTM: {
SDValue Mask = Op.getOperand(1);
- MVT MaskVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits());
+ MVT MaskVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
Mask = DAG.getBitcast(MaskVT, Mask);
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
}
Src2, Src1);
return DAG.getBitcast(VT, Res);
}
+ case CONVERT_TO_MASK: {
+ MVT SrcVT = Op.getOperand(1).getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
+
+ SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
+ Op.getOperand(1));
+ SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
+ DAG.getUNDEF(BitcastVT), CvtMask,
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getBitcast(Op.getValueType(), Res);
+ }
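A brief editorial note with a sketch (not part of the patch): the intrinsic
returns a plain integer such as i8, while the CVT2MASK node only produces
SrcVT.getVectorNumElements() i1 lanes, so the mask is first placed into an i1
vector with one lane per bit of the return type and only then bitcast. A
scalar C++ model of the two-lane (cvtq2mask_128-shaped) case; the padding
lanes are undef in the DAG and are zeroed here purely for determinism:

#include <cassert>
#include <cstdint>

// Two live mask lanes land in bit positions 0 and 1 of an 8-lane container
// (INSERT_SUBVECTOR at index 0); the container is then reinterpreted as i8
// (the final bitcast v8i1 -> i8).
static uint8_t widenV2MaskToI8(bool Lane0, bool Lane1) {
  uint8_t Container = 0; // stands in for the undef v8i1 vector
  Container |= static_cast<uint8_t>(Lane0) << 0;
  Container |= static_cast<uint8_t>(Lane1) << 1;
  return Container;
}

int main() {
  assert(widenV2MaskToI8(true, false) == 0x01);
  assert(widenV2MaskToI8(true, true) == 0x03);
  return 0;
}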
case CONVERT_MASK_TO_VEC: {
SDValue Mask = Op.getOperand(1);
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD";
case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD";
+ case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
case X86ISD::VSHL: return "X86ISD::VSHL";
// Vector signed/unsigned integer to double.
CVTDQ2PD, CVTUDQ2PD,
+ // Convert a vector to a mask; each bit is set based on the lane's MSB.
+ CVT2MASK,
+
// 128-bit vector logical left / right shift
VSHLDQ, VSRLDQ,
multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set _.KRC:$dst, (trunc (_.VT _.RC:$src)))]>, EVEX;
+ [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX;
}
multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr,
SDTCisOpSmallerThanOp<0, 1>,
SDTCisVT<2, i32>]>>;
+def X86cvt2mask : SDNode<"X86ISD::CVT2MASK", SDTIntTruncOp>;
+
//===----------------------------------------------------------------------===//
// SSE Complex Patterns
//===----------------------------------------------------------------------===//
COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
EXPAND_FROM_MEM, BLEND, INSERT_SUBVEC,
- TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, CONVERT_MASK_TO_VEC
+ TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, CONVERT_MASK_TO_VEC,
+ CONVERT_TO_MASK
};
struct IntrinsicData {
X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
+ X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtd2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtd2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtd2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtmask2b_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
X86_INTRINSIC_DATA(avx512_cvtmask2b_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
X86_INTRINSIC_DATA(avx512_cvtmask2b_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
X86_INTRINSIC_DATA(avx512_cvtmask2w_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
X86_INTRINSIC_DATA(avx512_cvtmask2w_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
X86_INTRINSIC_DATA(avx512_cvtmask2w_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtq2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtsi2sd32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtw2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtw2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=SKX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL_X64 --check-prefix=SKX
; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL_X32
-; KNL-LABEL: test1
-; KNL: vxorps
define <16 x i1> @test1() {
+; ALL_X64-LABEL: test1:
+; ALL_X64: ## BB#0:
+; ALL_X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ALL_X64-NEXT: retq
+;
+; KNL_X32-LABEL: test1:
+; KNL_X32: ## BB#0:
+; KNL_X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; KNL_X32-NEXT: retl
ret <16 x i1> zeroinitializer
}
-; SKX-LABEL: test2
-; SKX: vpmovb2m
-; SKX: vpmovb2m
-; SKX: kandw
-; SKX: vpmovm2b
-; KNL-LABEL: test2
-; KNL: vpmovsxbd
-; KNL: vpmovsxbd
-; KNL: vpandd
-; KNL: vpmovdb
define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) {
+; KNL-LABEL: test2:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
+; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test2:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
+; SKX-NEXT: vpmovb2m %xmm1, %k0
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: kandw %k0, %k1, %k0
+; SKX-NEXT: vpmovm2b %k0, %xmm0
+; SKX-NEXT: retq
+;
+; KNL_X32-LABEL: test2:
+; KNL_X32: ## BB#0:
+; KNL_X32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_X32-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL_X32-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL_X32-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL_X32-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL_X32-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
+; KNL_X32-NEXT: vpbroadcastd LCPI1_0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0
+; KNL_X32-NEXT: retl
%c = and <16 x i1>%a, %b
ret <16 x i1> %c
}
-; SKX-LABEL: test3
-; SKX: vpmovw2m
-; SKX: vpmovw2m
-; SKX: kandb
-; SKX: vpmovm2w
define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) {
+; KNL-LABEL: test3:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
+; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1}
+; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovqw %zmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test3:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
+; SKX-NEXT: vpmovw2m %xmm1, %k0
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: kandb %k0, %k1, %k0
+; SKX-NEXT: vpmovm2w %k0, %xmm0
+; SKX-NEXT: retq
+;
+; KNL_X32-LABEL: test3:
+; KNL_X32: ## BB#0:
+; KNL_X32-NEXT: vpmovsxwq %xmm1, %zmm1
+; KNL_X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [63,0,63,0,63,0,63,0,63,0,63,0,63,0,63,0]
+; KNL_X32-NEXT: vpsllvq %zmm2, %zmm1, %zmm1
+; KNL_X32-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL_X32-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
+; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL_X32-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1}
+; KNL_X32-NEXT: vpbroadcastd LCPI2_1, %zmm0
+; KNL_X32-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0
+; KNL_X32-NEXT: retl
%c = and <8 x i1>%a, %b
ret <8 x i1> %c
}
-; SKX-LABEL: test4
-; SKX: vpmovd2m
-; SKX: vpmovd2m
-; SKX: kandw
-; SKX: vpmovm2d
define <4 x i1> @test4(<4 x i1>%a, <4 x i1>%b) {
+; KNL-LABEL: test4:
+; KNL: ## BB#0:
+; KNL-NEXT: vandps %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test4:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0
+; SKX-NEXT: vpmovd2m %xmm0, %k0
+; SKX-NEXT: vpslld $31, %xmm1, %xmm0
+; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: kandw %k1, %k0, %k0
+; SKX-NEXT: vpmovm2d %k0, %xmm0
+; SKX-NEXT: retq
+;
+; KNL_X32-LABEL: test4:
+; KNL_X32: ## BB#0:
+; KNL_X32-NEXT: vandps %xmm1, %xmm0, %xmm0
+; KNL_X32-NEXT: retl
%c = and <4 x i1>%a, %b
ret <4 x i1> %c
}
-; SKX-LABEL: test5
-; SKX: vpcmpgtd
-; SKX: vpmovm2w
-; SKX: call
-; SKX: vpmovzxwd
declare <8 x i1> @func8xi1(<8 x i1> %a)
+
define <8 x i32> @test5(<8 x i32>%a, <8 x i32>%b) {
+; KNL-LABEL: test5:
+; KNL: ## BB#0:
+; KNL-NEXT: pushq %rax
+; KNL-NEXT: Ltmp0:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: callq _func8xi1
+; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; KNL-NEXT: vpslld $31, %ymm0, %ymm0
+; KNL-NEXT: vpsrad $31, %ymm0, %ymm0
+; KNL-NEXT: popq %rax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test5:
+; SKX: ## BB#0:
+; SKX-NEXT: pushq %rax
+; SKX-NEXT: Ltmp0:
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; SKX-NEXT: vpmovm2w %k0, %xmm0
+; SKX-NEXT: callq _func8xi1
+; SKX-NEXT: vpmovzxwd %xmm0, %ymm0
+; SKX-NEXT: vpslld $31, %ymm0, %ymm0
+; SKX-NEXT: vpsrad $31, %ymm0, %ymm0
+; SKX-NEXT: popq %rax
+; SKX-NEXT: retq
+;
+; KNL_X32-LABEL: test5:
+; KNL_X32: ## BB#0:
+; KNL_X32-NEXT: subl $12, %esp
+; KNL_X32-NEXT: Ltmp0:
+; KNL_X32-NEXT: .cfi_def_cfa_offset 16
+; KNL_X32-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0
+; KNL_X32-NEXT: calll L_func8xi1$stub
+; KNL_X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; KNL_X32-NEXT: vpslld $31, %ymm0, %ymm0
+; KNL_X32-NEXT: vpsrad $31, %ymm0, %ymm0
+; KNL_X32-NEXT: addl $12, %esp
+; KNL_X32-NEXT: retl
%cmpRes = icmp sgt <8 x i32>%a, %b
%resi = call <8 x i1> @func8xi1(<8 x i1> %cmpRes)
%res = sext <8 x i1>%resi to <8 x i32>
declare <16 x i1> @func16xi1(<16 x i1> %a)
-; KNL-LABEL: test6
-; KNL: vpbroadcastd
-; KNL: vpmovdb
-; KNL: call
-; KNL: vpmovzxbd
-; KNL: vpslld $31, %zmm
-; KNL: vpsrad $31, %zmm
define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
+; KNL-LABEL: test6:
+; KNL: ## BB#0:
+; KNL-NEXT: pushq %rax
+; KNL-NEXT: Ltmp1:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: callq _func16xi1
+; KNL-NEXT: vpmovzxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vpsrad $31, %zmm0, %zmm0
+; KNL-NEXT: popq %rax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test6:
+; SKX: ## BB#0:
+; SKX-NEXT: pushq %rax
+; SKX-NEXT: Ltmp1:
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; SKX-NEXT: vpmovm2b %k0, %xmm0
+; SKX-NEXT: callq _func16xi1
+; SKX-NEXT: vpmovzxbd %xmm0, %zmm0
+; SKX-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX-NEXT: vpsrad $31, %zmm0, %zmm0
+; SKX-NEXT: popq %rax
+; SKX-NEXT: retq
+;
+; KNL_X32-LABEL: test6:
+; KNL_X32: ## BB#0:
+; KNL_X32-NEXT: subl $12, %esp
+; KNL_X32-NEXT: Ltmp1:
+; KNL_X32-NEXT: .cfi_def_cfa_offset 16
+; KNL_X32-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; KNL_X32-NEXT: vpbroadcastd LCPI5_0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0
+; KNL_X32-NEXT: calll L_func16xi1$stub
+; KNL_X32-NEXT: vpmovzxbd %xmm0, %zmm0
+; KNL_X32-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL_X32-NEXT: vpsrad $31, %zmm0, %zmm0
+; KNL_X32-NEXT: addl $12, %esp
+; KNL_X32-NEXT: retl
%cmpRes = icmp sgt <16 x i32>%a, %b
%resi = call <16 x i1> @func16xi1(<16 x i1> %cmpRes)
%res = sext <16 x i1>%resi to <16 x i32>
}
declare <4 x i1> @func4xi1(<4 x i1> %a)
-; SKX-LABEL: test7
-; SKX: vpmovm2d
-; SKX: call
-; SKX: vpslld $31, %xmm
-; SKX: vpsrad $31, %xmm
define <4 x i32> @test7(<4 x i32>%a, <4 x i32>%b) {
+; KNL-LABEL: test7:
+; KNL: ## BB#0:
+; KNL-NEXT: pushq %rax
+; KNL-NEXT: Ltmp2:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: callq _func4xi1
+; KNL-NEXT: vpslld $31, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT: popq %rax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test7:
+; SKX: ## BB#0:
+; SKX-NEXT: pushq %rax
+; SKX-NEXT: Ltmp2:
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; SKX-NEXT: vpmovm2d %k0, %xmm0
+; SKX-NEXT: callq _func4xi1
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0
+; SKX-NEXT: vpsrad $31, %xmm0, %xmm0
+; SKX-NEXT: popq %rax
+; SKX-NEXT: retq
+;
+; KNL_X32-LABEL: test7:
+; KNL_X32: ## BB#0:
+; KNL_X32-NEXT: subl $12, %esp
+; KNL_X32-NEXT: Ltmp2:
+; KNL_X32-NEXT: .cfi_def_cfa_offset 16
+; KNL_X32-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; KNL_X32-NEXT: calll L_func4xi1$stub
+; KNL_X32-NEXT: vpslld $31, %xmm0, %xmm0
+; KNL_X32-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL_X32-NEXT: addl $12, %esp
+; KNL_X32-NEXT: retl
%cmpRes = icmp sgt <4 x i32>%a, %b
%resi = call <4 x i1> @func4xi1(<4 x i1> %cmpRes)
%res = sext <4 x i1>%resi to <4 x i32>
ret <4 x i32> %res
}
-; SKX-LABEL: test7a
-; SKX: call
-; SKX: vpmovw2m %xmm0, %k0
-; SKX: kandb
define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) {
+; KNL-LABEL: test7a:
+; KNL: ## BB#0:
+; KNL-NEXT: pushq %rax
+; KNL-NEXT: Ltmp3:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: callq _func8xi1
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: movb $85, %al
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1}
+; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovqw %zmm0, %xmm0
+; KNL-NEXT: popq %rax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test7a:
+; SKX: ## BB#0:
+; SKX-NEXT: pushq %rax
+; SKX-NEXT: Ltmp3:
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; SKX-NEXT: vpmovm2w %k0, %xmm0
+; SKX-NEXT: callq _func8xi1
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k0
+; SKX-NEXT: movb $85, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: kandb %k1, %k0, %k0
+; SKX-NEXT: vpmovm2w %k0, %xmm0
+; SKX-NEXT: popq %rax
+; SKX-NEXT: retq
+;
+; KNL_X32-LABEL: test7a:
+; KNL_X32: ## BB#0:
+; KNL_X32-NEXT: subl $12, %esp
+; KNL_X32-NEXT: Ltmp3:
+; KNL_X32-NEXT: .cfi_def_cfa_offset 16
+; KNL_X32-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0
+; KNL_X32-NEXT: calll L_func8xi1$stub
+; KNL_X32-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL_X32-NEXT: vpsllvq LCPI7_0, %zmm0, %zmm0
+; KNL_X32-NEXT: movb $85, %al
+; KNL_X32-NEXT: movzbl %al, %eax
+; KNL_X32-NEXT: kmovw %eax, %k1
+; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1}
+; KNL_X32-NEXT: vpbroadcastd LCPI7_1, %zmm0
+; KNL_X32-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0
+; KNL_X32-NEXT: addl $12, %esp
+; KNL_X32-NEXT: retl
%cmpRes = icmp sgt <8 x i32>%a, %b
%resi = call <8 x i1> @func8xi1(<8 x i1> %cmpRes)
%res = and <8 x i1>%resi, <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
ret <8 x i1> %res
}
-
-; KNL_X32-LABEL: test8
-; KNL_X32: testb $1, 4(%esp)
-; KNL_X32:jne
-
-; KNL-LABEL: test8
-; KNL: testb $1, %dil
-; KNL:jne
-
define <16 x i8> @test8(<16 x i8> %a1, <16 x i8> %a2, i1 %cond) {
+; ALL_X64-LABEL: test8:
+; ALL_X64: ## BB#0:
+; ALL_X64-NEXT: testb $1, %dil
+; ALL_X64-NEXT: jne LBB8_2
+; ALL_X64-NEXT: ## BB#1:
+; ALL_X64-NEXT: vmovaps %zmm1, %zmm0
+; ALL_X64-NEXT: LBB8_2:
+; ALL_X64-NEXT: retq
+;
+; KNL_X32-LABEL: test8:
+; KNL_X32: ## BB#0:
+; KNL_X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT: jne LBB8_2
+; KNL_X32-NEXT: ## BB#1:
+; KNL_X32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_X32-NEXT: LBB8_2:
+; KNL_X32-NEXT: retl
%res = select i1 %cond, <16 x i8> %a1, <16 x i8> %a2
ret <16 x i8> %res
}
-; KNL-LABEL: test9
-; KNL: vucomisd
-; KNL: setb
define i1 @test9(double %a, double %b) {
+; ALL_X64-LABEL: test9:
+; ALL_X64: ## BB#0:
+; ALL_X64-NEXT: vucomisd %xmm0, %xmm1
+; ALL_X64-NEXT: setb %al
+; ALL_X64-NEXT: retq
+;
+; KNL_X32-LABEL: test9:
+; KNL_X32: ## BB#0:
+; KNL_X32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0
+; KNL_X32-NEXT: vucomisd {{[0-9]+}}(%esp), %xmm0
+; KNL_X32-NEXT: setb %al
+; KNL_X32-NEXT: retl
%c = fcmp ugt double %a, %b
ret i1 %c
}
-; KNL_X32-LABEL: test10
-; KNL_X32: testb $1, 12(%esp)
-; KNL_X32: cmovnel
-
-; KNL-LABEL: test10
-; KNL: testb $1, %dl
-; KNL: cmovel
define i32 @test10(i32 %a, i32 %b, i1 %cond) {
+; ALL_X64-LABEL: test10:
+; ALL_X64: ## BB#0:
+; ALL_X64-NEXT: testb $1, %dl
+; ALL_X64-NEXT: cmovel %esi, %edi
+; ALL_X64-NEXT: movl %edi, %eax
+; ALL_X64-NEXT: retq
+;
+; KNL_X32-LABEL: test10:
+; KNL_X32: ## BB#0:
+; KNL_X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT: leal {{[0-9]+}}(%esp), %eax
+; KNL_X32-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; KNL_X32-NEXT: cmovnel %eax, %ecx
+; KNL_X32-NEXT: movl (%ecx), %eax
+; KNL_X32-NEXT: retl
%c = select i1 %cond, i32 %a, i32 %b
ret i32 %c
}
-; KNL-LABEL: test11
-; KNL: cmp
-; KNL: setg
define i1 @test11(i32 %a, i32 %b) {
+; ALL_X64-LABEL: test11:
+; ALL_X64: ## BB#0:
+; ALL_X64-NEXT: cmpl %esi, %edi
+; ALL_X64-NEXT: setg %al
+; ALL_X64-NEXT: retq
+;
+; KNL_X32-LABEL: test11:
+; KNL_X32: ## BB#0:
+; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_X32-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; KNL_X32-NEXT: setg %al
+; KNL_X32-NEXT: retl
%c = icmp sgt i32 %a, %b
ret i1 %c
}
-; KNL-LABEL: test12
-; KNL: callq _test11
-;; return value in %al
-; KNL: movzbl %al, %ebx
-; KNL: callq _test10
-; KNL: testb $1, %bl
-
define i32 @test12(i32 %a1, i32 %a2, i32 %b1) {
+; ALL_X64-LABEL: test12:
+; ALL_X64: ## BB#0:
+; ALL_X64-NEXT: pushq %rbp
+; ALL_X64-NEXT: Ltmp4:
+; ALL_X64-NEXT: .cfi_def_cfa_offset 16
+; ALL_X64-NEXT: pushq %r14
+; ALL_X64-NEXT: Ltmp5:
+; ALL_X64-NEXT: .cfi_def_cfa_offset 24
+; ALL_X64-NEXT: pushq %rbx
+; ALL_X64-NEXT: Ltmp6:
+; ALL_X64-NEXT: .cfi_def_cfa_offset 32
+; ALL_X64-NEXT: Ltmp7:
+; ALL_X64-NEXT: .cfi_offset %rbx, -32
+; ALL_X64-NEXT: Ltmp8:
+; ALL_X64-NEXT: .cfi_offset %r14, -24
+; ALL_X64-NEXT: Ltmp9:
+; ALL_X64-NEXT: .cfi_offset %rbp, -16
+; ALL_X64-NEXT: movl %esi, %r14d
+; ALL_X64-NEXT: movl %edi, %ebp
+; ALL_X64-NEXT: movl %edx, %esi
+; ALL_X64-NEXT: callq _test11
+; ALL_X64-NEXT: movzbl %al, %ebx
+; ALL_X64-NEXT: movl %ebp, %edi
+; ALL_X64-NEXT: movl %r14d, %esi
+; ALL_X64-NEXT: movl %ebx, %edx
+; ALL_X64-NEXT: callq _test10
+; ALL_X64-NEXT: xorl %ecx, %ecx
+; ALL_X64-NEXT: testb $1, %bl
+; ALL_X64-NEXT: cmovel %ecx, %eax
+; ALL_X64-NEXT: popq %rbx
+; ALL_X64-NEXT: popq %r14
+; ALL_X64-NEXT: popq %rbp
+; ALL_X64-NEXT: retq
+;
+; KNL_X32-LABEL: test12:
+; KNL_X32: ## BB#0:
+; KNL_X32-NEXT: pushl %ebx
+; KNL_X32-NEXT: Ltmp4:
+; KNL_X32-NEXT: .cfi_def_cfa_offset 8
+; KNL_X32-NEXT: pushl %edi
+; KNL_X32-NEXT: Ltmp5:
+; KNL_X32-NEXT: .cfi_def_cfa_offset 12
+; KNL_X32-NEXT: pushl %esi
+; KNL_X32-NEXT: Ltmp6:
+; KNL_X32-NEXT: .cfi_def_cfa_offset 16
+; KNL_X32-NEXT: subl $16, %esp
+; KNL_X32-NEXT: Ltmp7:
+; KNL_X32-NEXT: .cfi_def_cfa_offset 32
+; KNL_X32-NEXT: Ltmp8:
+; KNL_X32-NEXT: .cfi_offset %esi, -16
+; KNL_X32-NEXT: Ltmp9:
+; KNL_X32-NEXT: .cfi_offset %edi, -12
+; KNL_X32-NEXT: Ltmp10:
+; KNL_X32-NEXT: .cfi_offset %ebx, -8
+; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT: movl %edi, (%esp)
+; KNL_X32-NEXT: calll _test11
+; KNL_X32-NEXT: movb %al, %bl
+; KNL_X32-NEXT: movzbl %bl, %eax
+; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT: movl %edi, (%esp)
+; KNL_X32-NEXT: calll _test10
+; KNL_X32-NEXT: xorl %ecx, %ecx
+; KNL_X32-NEXT: testb $1, %bl
+; KNL_X32-NEXT: cmovel %ecx, %eax
+; KNL_X32-NEXT: addl $16, %esp
+; KNL_X32-NEXT: popl %esi
+; KNL_X32-NEXT: popl %edi
+; KNL_X32-NEXT: popl %ebx
+; KNL_X32-NEXT: retl
%cond = call i1 @test11(i32 %a1, i32 %b1)
%res = call i32 @test10(i32 %a1, i32 %a2, i1 %cond)
%res1 = select i1 %cond, i32 %res, i32 0
ret i32 %res1
-}
\ No newline at end of file
+}
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=SKX
-
- ;SKX-LABEL: zext_8x8mem_to_8x16:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovw2m %xmm0, %k1
-;SKX-NEXT: vpmovzxbw (%rdi), %xmm0 {%k1} {z}
-;SKX-NEXT: retq
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=SKX
+
define <8 x i16> @zext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_8x8mem_to_8x16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; KNL-NEXT: vpsllw $15, %xmm0, %xmm0
+; KNL-NEXT: vpsraw $15, %xmm0, %xmm0
+; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_8x8mem_to_8x16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vpmovzxbw (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <8 x i8>,<8 x i8> *%i,align 1
- %x = zext <8 x i8> %a to <8 x i16>
- %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer
+ %x = zext <8 x i8> %a to <8 x i16>
+ %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer
ret <8 x i16> %ret
}
-;SKX-LABEL: sext_8x8mem_to_8x16:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovw2m %xmm0, %k1
-;SKX-NEXT: vpmovsxbw (%rdi), %xmm0 {%k1} {z}
-;SKX-NEXT: retq
define <8 x i16> @sext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_8x8mem_to_8x16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbw (%rdi), %xmm1
+; KNL-NEXT: vpsllw $15, %xmm0, %xmm0
+; KNL-NEXT: vpsraw $15, %xmm0, %xmm0
+; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_8x8mem_to_8x16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vpmovsxbw (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <8 x i8>,<8 x i8> *%i,align 1
- %x = sext <8 x i8> %a to <8 x i16>
- %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer
+ %x = sext <8 x i8> %a to <8 x i16>
+ %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer
ret <8 x i16> %ret
}
-;SKX-LABEL: zext_16x8mem_to_16x16:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovb2m %xmm0, %k1
-;SKX-NEXT: vpmovzxbw (%rdi), %ymm0 {%k1} {z}
-;SKX-NEXT: retq
+
define <16 x i16> @zext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_16x8mem_to_16x16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
+; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
+; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_16x8mem_to_16x16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: vpmovzxbw (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <16 x i8>,<16 x i8> *%i,align 1
- %x = zext <16 x i8> %a to <16 x i16>
- %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
+ %x = zext <16 x i8> %a to <16 x i16>
+ %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
ret <16 x i16> %ret
}
-;SKX-LABEL: sext_16x8mem_to_16x16:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovb2m %xmm0, %k1
-;SKX-NEXT: vpmovsxbw (%rdi), %ymm0 {%k1} {z}
-;SKX-NEXT: retq
define <16 x i16> @sext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_16x8mem_to_16x16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; KNL-NEXT: vpmovsxbw (%rdi), %ymm1
+; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
+; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
+; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_16x8mem_to_16x16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: vpmovsxbw (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <16 x i8>,<16 x i8> *%i,align 1
- %x = sext <16 x i8> %a to <16 x i16>
- %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
+ %x = sext <16 x i8> %a to <16 x i16>
+ %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
ret <16 x i16> %ret
}
-;SKX-LABEL: zext_16x8_to_16x16:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovzxbw %xmm0, %ymm0
-;SKX-NEXT: retq
-define <16 x i16> @zext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone {
- %x = zext <16 x i8> %a to <16 x i16>
+define <16 x i16> @zext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone {
+; KNL-LABEL: zext_16x8_to_16x16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_16x8_to_16x16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmovzxbw %xmm0, %ymm0
+; SKX-NEXT: retq
+ %x = zext <16 x i8> %a to <16 x i16>
ret <16 x i16> %x
}
-;SKX-LABEL: zext_16x8_to_16x16_mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovb2m %xmm1, %k1
-;SKX-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z}
-;SKX-NEXT: retq
-define <16 x i16> @zext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone {
- %x = zext <16 x i8> %a to <16 x i16>
- %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
+define <16 x i16> @zext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_16x8_to_16x16_mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
+; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
+; KNL-NEXT: vpand %ymm0, %ymm1, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_16x8_to_16x16_mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
+; SKX-NEXT: vpmovb2m %xmm1, %k1
+; SKX-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %x = zext <16 x i8> %a to <16 x i16>
+ %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
ret <16 x i16> %ret
}
-;SKX-LABEL: sext_16x8_to_16x16:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovsxbw %xmm0, %ymm0
-;SKX-NEXT: retq
-define <16 x i16> @sext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone {
- %x = sext <16 x i8> %a to <16 x i16>
+define <16 x i16> @sext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone {
+; ALL-LABEL: sext_16x8_to_16x16:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxbw %xmm0, %ymm0
+; ALL-NEXT: retq
+ %x = sext <16 x i8> %a to <16 x i16>
ret <16 x i16> %x
}
-;SKX-LABEL: sext_16x8_to_16x16_mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovb2m %xmm1, %k1
-;SKX-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z}
-;SKX-NEXT: retq
-define <16 x i16> @sext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone {
- %x = sext <16 x i8> %a to <16 x i16>
- %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
+define <16 x i16> @sext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_16x8_to_16x16_mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; KNL-NEXT: vpmovsxbw %xmm0, %ymm0
+; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
+; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
+; KNL-NEXT: vpand %ymm0, %ymm1, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_16x8_to_16x16_mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
+; SKX-NEXT: vpmovb2m %xmm1, %k1
+; SKX-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %x = sext <16 x i8> %a to <16 x i16>
+ %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
ret <16 x i16> %ret
}
-;SKX-LABEL: zext_32x8mem_to_32x16:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovb2m %ymm0, %k1
-;SKX-NEXT: vpmovzxbw (%rdi), %zmm0 {%k1} {z}
-;SKX-NEXT: retq
define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_32x8mem_to_32x16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; KNL-NEXT: vpsllw $15, %ymm3, %ymm3
+; KNL-NEXT: vpsraw $15, %ymm3, %ymm3
+; KNL-NEXT: vpand %ymm2, %ymm3, %ymm2
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
+; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
+; KNL-NEXT: vpand %ymm1, %ymm0, %ymm1
+; KNL-NEXT: vmovaps %zmm2, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_32x8mem_to_32x16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
+; SKX-NEXT: vpmovb2m %ymm0, %k1
+; SKX-NEXT: vpmovzxbw (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <32 x i8>,<32 x i8> *%i,align 1
- %x = zext <32 x i8> %a to <32 x i16>
- %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
+ %x = zext <32 x i8> %a to <32 x i16>
+ %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
ret <32 x i16> %ret
}
-;SKX-LABEL: sext_32x8mem_to_32x16:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovb2m %ymm0, %k1
-;SKX-NEXT: vpmovsxbw (%rdi), %zmm0 {%k1} {z}
-;SKX-NEXT: retq
define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_32x8mem_to_32x16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbw 16(%rdi), %ymm1
+; KNL-NEXT: vpmovsxbw (%rdi), %ymm2
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; KNL-NEXT: vpsllw $15, %ymm3, %ymm3
+; KNL-NEXT: vpsraw $15, %ymm3, %ymm3
+; KNL-NEXT: vpand %ymm2, %ymm3, %ymm2
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
+; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
+; KNL-NEXT: vpand %ymm1, %ymm0, %ymm1
+; KNL-NEXT: vmovaps %zmm2, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_32x8mem_to_32x16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
+; SKX-NEXT: vpmovb2m %ymm0, %k1
+; SKX-NEXT: vpmovsxbw (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <32 x i8>,<32 x i8> *%i,align 1
- %x = sext <32 x i8> %a to <32 x i16>
- %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
+ %x = sext <32 x i8> %a to <32 x i16>
+ %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
ret <32 x i16> %ret
}
-;SKX-LABEL: zext_32x8_to_32x16:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovzxbw %ymm0, %zmm0
-;SKX-NEXT: retq
-define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
- %x = zext <32 x i8> %a to <32 x i16>
+define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
+; KNL-LABEL: zext_32x8_to_32x16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; KNL-NEXT: vmovaps %zmm2, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_32x8_to_32x16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmovzxbw %ymm0, %zmm0
+; SKX-NEXT: retq
+ %x = zext <32 x i8> %a to <32 x i16>
ret <32 x i16> %x
}
-;SKX-LABEL: zext_32x8_to_32x16_mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovb2m %ymm1, %k1
-;SKX-NEXT: vpmovzxbw %ymm0, %zmm0 {%k1} {z}
-;SKX-NEXT: retq
-define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone {
+define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_32x8_to_32x16_mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; KNL-NEXT: vpsllw $15, %ymm3, %ymm3
+; KNL-NEXT: vpsraw $15, %ymm3, %ymm3
+; KNL-NEXT: vpand %ymm0, %ymm3, %ymm0
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
+; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
+; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_32x8_to_32x16_mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %ymm1, %ymm1
+; SKX-NEXT: vpmovb2m %ymm1, %k1
+; SKX-NEXT: vpmovzxbw %ymm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%x = zext <32 x i8> %a to <32 x i16>
- %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
+ %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
ret <32 x i16> %ret
}
-;SKX-LABEL: sext_32x8_to_32x16:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovsxbw %ymm0, %zmm0
-;SKX-NEXT: retq
-define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
- %x = sext <32 x i8> %a to <32 x i16>
+define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
+; KNL-LABEL: sext_32x8_to_32x16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbw %xmm0, %ymm2
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpmovsxbw %xmm0, %ymm1
+; KNL-NEXT: vmovaps %zmm2, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_32x8_to_32x16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmovsxbw %ymm0, %zmm0
+; SKX-NEXT: retq
+ %x = sext <32 x i8> %a to <32 x i16>
ret <32 x i16> %x
}
-;SKX-LABEL: sext_32x8_to_32x16_mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovb2m %ymm1, %k1
-;SKX-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z}
-;SKX-NEXT: retq
-define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone {
+define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_32x8_to_32x16_mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; KNL-NEXT: vpmovsxbw %xmm2, %ymm2
+; KNL-NEXT: vpmovsxbw %xmm0, %ymm0
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; KNL-NEXT: vpsllw $15, %ymm3, %ymm3
+; KNL-NEXT: vpsraw $15, %ymm3, %ymm3
+; KNL-NEXT: vpand %ymm0, %ymm3, %ymm0
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
+; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
+; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_32x8_to_32x16_mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %ymm1, %ymm1
+; SKX-NEXT: vpmovb2m %ymm1, %k1
+; SKX-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%x = sext <32 x i8> %a to <32 x i16>
- %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
+ %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
ret <32 x i16> %ret
}
-;SKX-LABEL: zext_4x8mem_to_4x32:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovd2m %xmm0, %k1
-;SKX-NEXT: vpmovzxbd (%rdi), %xmm0 {%k1} {z}
-;SKX-NEXT: retq
define <4 x i32> @zext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_4x8mem_to_4x32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $31, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_4x8mem_to_4x32:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0
+; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vpmovzxbd (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <4 x i8>,<4 x i8> *%i,align 1
- %x = zext <4 x i8> %a to <4 x i32>
- %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer
+ %x = zext <4 x i8> %a to <4 x i32>
+ %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer
ret <4 x i32> %ret
}
-;SKX-LABEL: sext_4x8mem_to_4x32:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovd2m %xmm0, %k1
-;SKX-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z}
-;SKX-NEXT: retq
define <4 x i32> @sext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_4x8mem_to_4x32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $31, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxbd (%rdi), %xmm1
+; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_4x8mem_to_4x32:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0
+; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <4 x i8>,<4 x i8> *%i,align 1
- %x = sext <4 x i8> %a to <4 x i32>
- %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer
+ %x = sext <4 x i8> %a to <4 x i32>
+ %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer
ret <4 x i32> %ret
}
-;SKX-LABEL: zext_8x8mem_to_8x32:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovw2m %xmm0, %k1
-;SKX-NEXT: vpmovzxbd (%rdi), %ymm0 {%k1} {z}
-;SKX-NEXT: retq
define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_8x8mem_to_8x32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_8x8mem_to_8x32:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vpmovzxbd (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <8 x i8>,<8 x i8> *%i,align 1
- %x = zext <8 x i8> %a to <8 x i32>
- %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
+ %x = zext <8 x i8> %a to <8 x i32>
+ %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
ret <8 x i32> %ret
}
-;SKX-LABEL: sext_8x8mem_to_8x32:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovw2m %xmm0, %k1
-;SKX-NEXT: vpmovsxbd (%rdi), %ymm0 {%k1} {z}
-;SKX-NEXT: retq
define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_8x8mem_to_8x32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL-NEXT: vpmovsxbd (%rdi), %ymm0
+; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_8x8mem_to_8x32:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vpmovsxbd (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <8 x i8>,<8 x i8> *%i,align 1
- %x = sext <8 x i8> %a to <8 x i32>
- %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
+ %x = sext <8 x i8> %a to <8 x i32>
+ %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
ret <8 x i32> %ret
}
-;KNL-LABEL: zext_16x8mem_to_16x32:
-;KNL: vpmovzxbd (%rdi), %zmm0 {%k1} {z}
-;KNL-NEXT: retq
define <16 x i32> @zext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_16x8mem_to_16x32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT: vpmovzxbd (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_16x8mem_to_16x32:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: vpmovzxbd (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <16 x i8>,<16 x i8> *%i,align 1
%x = zext <16 x i8> %a to <16 x i32>
%ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
ret <16 x i32> %ret
}
-;KNL-LABEL: sext_16x8mem_to_16x32:
-;KNL: vpmovsxbd (%rdi), %zmm0 {%k1} {z}
-;KNL-NEXT: retq
define <16 x i32> @sext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_16x8mem_to_16x32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT: vpmovsxbd (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_16x8mem_to_16x32:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: vpmovsxbd (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <16 x i8>,<16 x i8> *%i,align 1
%x = sext <16 x i8> %a to <16 x i32>
%ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
ret <16 x i32> %ret
}
-;KNL-LABEL: zext_16x8_to_16x32_mask:
-;KNL: vpmovzxbd %xmm0, %zmm0 {%k1} {z}
-;KNL-NEXT: retq
define <16 x i32> @zext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_16x8_to_16x32_mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT: vpmovzxbd %xmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_16x8_to_16x32_mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
+; SKX-NEXT: vpmovb2m %xmm1, %k1
+; SKX-NEXT: vpmovzxbd %xmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%x = zext <16 x i8> %a to <16 x i32>
%ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
ret <16 x i32> %ret
}
-;KNL-LABEL: sext_16x8_to_16x32_mask:
-;KNL: vpmovsxbd %xmm0, %zmm0 {%k1} {z}
-;KNL-NEXT: retq
define <16 x i32> @sext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_16x8_to_16x32_mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_16x8_to_16x32_mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
+; SKX-NEXT: vpmovb2m %xmm1, %k1
+; SKX-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%x = sext <16 x i8> %a to <16 x i32>
%ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
ret <16 x i32> %ret
}
-; KNL-LABEL: zext_16x8_to_16x32
-; KNL: vpmovzxbd {{.*}}%zmm
-; KNL: ret
define <16 x i32> @zext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
+; ALL-LABEL: zext_16x8_to_16x32:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxbd %xmm0, %zmm0
+; ALL-NEXT: retq
%x = zext <16 x i8> %i to <16 x i32>
ret <16 x i32> %x
}
-; KNL-LABEL: sext_16x8_to_16x32
-; KNL: vpmovsxbd {{.*}}%zmm
-; KNL: ret
define <16 x i32> @sext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
+; ALL-LABEL: sext_16x8_to_16x32:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxbd %xmm0, %zmm0
+; ALL-NEXT: retq
%x = sext <16 x i8> %i to <16 x i32>
ret <16 x i32> %x
}
-;SKX-LABEL: zext_2x8mem_to_2x64:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovq2m %xmm0, %k1
-;SKX-NEXT: vpmovzxbq (%rdi), %xmm0 {%k1} {z}
-;SKX-NEXT: retq
define <2 x i64> @zext_2x8mem_to_2x64(<2 x i8> *%i , <2 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_2x8mem_to_2x64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsllq $63, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; KNL-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_2x8mem_to_2x64:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
+; SKX-NEXT: vpmovq2m %xmm0, %k1
+; SKX-NEXT: vpmovzxbq (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <2 x i8>,<2 x i8> *%i,align 1
%x = zext <2 x i8> %a to <2 x i64>
%ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer
ret <2 x i64> %ret
}
-;SKX-LABEL: sext_2x8mem_to_2x64mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovq2m %xmm0, %k1
-;SKX-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z}
-;SKX-NEXT: retq
define <2 x i64> @sext_2x8mem_to_2x64mask(<2 x i8> *%i , <2 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_2x8mem_to_2x64mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsllq $63, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; KNL-NEXT: vpmovsxbq (%rdi), %xmm1
+; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_2x8mem_to_2x64mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
+; SKX-NEXT: vpmovq2m %xmm0, %k1
+; SKX-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <2 x i8>,<2 x i8> *%i,align 1
%x = sext <2 x i8> %a to <2 x i64>
%ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer
ret <2 x i64> %ret
}
-;SKX-LABEL: sext_2x8mem_to_2x64:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovsxbq (%rdi), %xmm0
-;SKX-NEXT: retq
define <2 x i64> @sext_2x8mem_to_2x64(<2 x i8> *%i) nounwind readnone {
+; ALL-LABEL: sext_2x8mem_to_2x64:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxbq (%rdi), %xmm0
+; ALL-NEXT: retq
%a = load <2 x i8>,<2 x i8> *%i,align 1
%x = sext <2 x i8> %a to <2 x i64>
ret <2 x i64> %x
}
-;SKX-LABEL: zext_4x8mem_to_4x64:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovd2m %xmm0, %k1
-;SKX-NEXT: vpmovzxbq (%rdi), %ymm0 {%k1} {z}
-;SKX-NEXT: retq
define <4 x i64> @zext_4x8mem_to_4x64(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_4x8mem_to_4x64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $31, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxdq %xmm0, %ymm0
+; KNL-NEXT: vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_4x8mem_to_4x64:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0
+; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vpmovzxbq (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <4 x i8>,<4 x i8> *%i,align 1
%x = zext <4 x i8> %a to <4 x i64>
%ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer
ret <4 x i64> %ret
}
-;SKX-LABEL: sext_4x8mem_to_4x64mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovd2m %xmm0, %k1
-;SKX-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z}
-;SKX-NEXT: retq
define <4 x i64> @sext_4x8mem_to_4x64mask(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_4x8mem_to_4x64mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $31, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxdq %xmm0, %ymm0
+; KNL-NEXT: vpmovsxbq (%rdi), %ymm1
+; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_4x8mem_to_4x64mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0
+; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <4 x i8>,<4 x i8> *%i,align 1
%x = sext <4 x i8> %a to <4 x i64>
%ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer
ret <4 x i64> %ret
}
-;SKX-LABEL: sext_4x8mem_to_4x64:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovsxbq (%rdi), %ymm0
-;SKX-NEXT: retq
define <4 x i64> @sext_4x8mem_to_4x64(<4 x i8> *%i) nounwind readnone {
+; ALL-LABEL: sext_4x8mem_to_4x64:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxbq (%rdi), %ymm0
+; ALL-NEXT: retq
%a = load <4 x i8>,<4 x i8> *%i,align 1
%x = sext <4 x i8> %a to <4 x i64>
ret <4 x i64> %x
}
-;KNL-LABEL: zext_8x8mem_to_8x64:
-;KNL: vpmovzxbq (%rdi), %zmm0 {%k1} {z}
-;KNL-NEXT: retq
define <8 x i64> @zext_8x8mem_to_8x64(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_8x8mem_to_8x64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL-NEXT: vpmovzxbq (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_8x8mem_to_8x64:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vpmovzxbq (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <8 x i8>,<8 x i8> *%i,align 1
%x = zext <8 x i8> %a to <8 x i64>
%ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
ret <8 x i64> %ret
}
-;KNL-LABEL: sext_8x8mem_to_8x64mask:
-;KNL: vpmovsxbq (%rdi), %zmm0 {%k1} {z}
-;KNL-NEXT: retq
define <8 x i64> @sext_8x8mem_to_8x64mask(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_8x8mem_to_8x64mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL-NEXT: vpmovsxbq (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_8x8mem_to_8x64mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vpmovsxbq (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <8 x i8>,<8 x i8> *%i,align 1
%x = sext <8 x i8> %a to <8 x i64>
%ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
ret <8 x i64> %ret
}
-;KNL-LABEL: sext_8x8mem_to_8x64:
-;KNL: vpmovsxbq (%rdi), %zmm0
-;KNL-NEXT: retq
define <8 x i64> @sext_8x8mem_to_8x64(<8 x i8> *%i) nounwind readnone {
+; ALL-LABEL: sext_8x8mem_to_8x64:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxbq (%rdi), %zmm0
+; ALL-NEXT: retq
%a = load <8 x i8>,<8 x i8> *%i,align 1
%x = sext <8 x i8> %a to <8 x i64>
ret <8 x i64> %x
}
-;SKX-LABEL: zext_4x16mem_to_4x32:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovd2m %xmm0, %k1
-;SKX-NEXT: vpmovzxwd (%rdi), %xmm0 {%k1} {z}
-;SKX-NEXT: retq
define <4 x i32> @zext_4x16mem_to_4x32(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_4x16mem_to_4x32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $31, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_4x16mem_to_4x32:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0
+; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vpmovzxwd (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
%x = zext <4 x i16> %a to <4 x i32>
%ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer
ret <4 x i32> %ret
}
-;SKX-LABEL: sext_4x16mem_to_4x32mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovd2m %xmm0, %k1
-;SKX-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z}
-;SKX-NEXT: retq
define <4 x i32> @sext_4x16mem_to_4x32mask(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_4x16mem_to_4x32mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $31, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxwd (%rdi), %xmm1
+; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_4x16mem_to_4x32mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0
+; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
%x = sext <4 x i16> %a to <4 x i32>
%ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer
ret <4 x i32> %ret
}
-;SKX-LABEL: sext_4x16mem_to_4x32:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovsxwd (%rdi), %xmm0
-;SKX-NEXT: retq
define <4 x i32> @sext_4x16mem_to_4x32(<4 x i16> *%i) nounwind readnone {
+; ALL-LABEL: sext_4x16mem_to_4x32:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxwd (%rdi), %xmm0
+; ALL-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
%x = sext <4 x i16> %a to <4 x i32>
ret <4 x i32> %x
}
-;SKX-LABEL: zext_8x16mem_to_8x32:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovw2m %xmm0, %k1
-;SKX-NEXT: vpmovzxwd (%rdi), %ymm0 {%k1} {z}
-;SKX-NEXT: retq
define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_8x16mem_to_8x32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_8x16mem_to_8x32:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vpmovzxwd (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <8 x i16>,<8 x i16> *%i,align 1
%x = zext <8 x i16> %a to <8 x i32>
%ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
ret <8 x i32> %ret
}
-;SKX-LABEL: sext_8x16mem_to_8x32mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovw2m %xmm0, %k1
-;SKX-NEXT: vpmovsxwd (%rdi), %ymm0 {%k1} {z}
-;SKX-NEXT: retq
define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_8x16mem_to_8x32mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL-NEXT: vpmovsxwd (%rdi), %ymm0
+; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_8x16mem_to_8x32mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vpmovsxwd (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <8 x i16>,<8 x i16> *%i,align 1
%x = sext <8 x i16> %a to <8 x i32>
%ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
ret <8 x i32> %ret
}
-;SKX-LABEL: sext_8x16mem_to_8x32:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovsxwd (%rdi), %ymm0
-;SKX-NEXT: retq
define <8 x i32> @sext_8x16mem_to_8x32(<8 x i16> *%i) nounwind readnone {
+; ALL-LABEL: sext_8x16mem_to_8x32:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxwd (%rdi), %ymm0
+; ALL-NEXT: retq
%a = load <8 x i16>,<8 x i16> *%i,align 1
%x = sext <8 x i16> %a to <8 x i32>
ret <8 x i32> %x
}
-;SKX-LABEL: zext_8x16_to_8x32mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovw2m %xmm1, %k1
-;SKX-NEXT: vpmovzxwd %xmm0, %ymm0 {%k1} {z}
-;SKX-NEXT: retq
define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_8x16_to_8x32mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
+; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
+; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_8x16_to_8x32mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
+; SKX-NEXT: vpmovw2m %xmm1, %k1
+; SKX-NEXT: vpmovzxwd %xmm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
%x = zext <8 x i16> %a to <8 x i32>
%ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
ret <8 x i32> %ret
}
-;SKX-LABEL: zext_8x16_to_8x32:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovzxwd %xmm0, %ymm0
-;SKX-NEXT: retq
define <8 x i32> @zext_8x16_to_8x32(<8 x i16> %a ) nounwind readnone {
+; KNL-LABEL: zext_8x16_to_8x32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_8x16_to_8x32:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmovzxwd %xmm0, %ymm0
+; SKX-NEXT: retq
%x = zext <8 x i16> %a to <8 x i32>
ret <8 x i32> %x
}
-;SKX-LABEL: zext_16x16mem_to_16x32:
-;KNL-LABEL: zext_16x16mem_to_16x32:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovb2m %xmm0, %k1
-;SKX-NEXT: vpmovzxwd (%rdi), %zmm0 {%k1} {z}
-;KNL: vpmovzxwd (%rdi), %zmm0 {%k1} {z}
-;SKX-NEXT: retq
define <16 x i32> @zext_16x16mem_to_16x32(<16 x i16> *%i , <16 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_16x16mem_to_16x32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT: vpmovzxwd (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_16x16mem_to_16x32:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: vpmovzxwd (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <16 x i16>,<16 x i16> *%i,align 1
%x = zext <16 x i16> %a to <16 x i32>
%ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
ret <16 x i32> %ret
}
-;SKX-LABEL: sext_16x16mem_to_16x32mask:
-;KNL-LABEL: sext_16x16mem_to_16x32mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovb2m %xmm0, %k1
-;SKX-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z}
-;KNL: vpmovsxwd (%rdi), %zmm0 {%k1} {z}
-;SKX-NEXT: retq
define <16 x i32> @sext_16x16mem_to_16x32mask(<16 x i16> *%i , <16 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_16x16mem_to_16x32mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_16x16mem_to_16x32mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <16 x i16>,<16 x i16> *%i,align 1
%x = sext <16 x i16> %a to <16 x i32>
%ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
ret <16 x i32> %ret
}
-;SKX-LABEL: sext_16x16mem_to_16x32:
-;KNL-LABEL: sext_16x16mem_to_16x32:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovsxwd (%rdi), %zmm0
-;KNL: vpmovsxwd (%rdi), %zmm0
-;SKX-NEXT: retq
define <16 x i32> @sext_16x16mem_to_16x32(<16 x i16> *%i) nounwind readnone {
+; ALL-LABEL: sext_16x16mem_to_16x32:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxwd (%rdi), %zmm0
+; ALL-NEXT: retq
%a = load <16 x i16>,<16 x i16> *%i,align 1
%x = sext <16 x i16> %a to <16 x i32>
ret <16 x i32> %x
}
-;SKX-LABEL: zext_16x16_to_16x32mask:
-;KNL-LABEL: zext_16x16_to_16x32mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovb2m %xmm1, %k1
-;SKX-NEXT: vpmovzxwd %ymm0, %zmm0 {%k1} {z}
-;KNL: vpmovzxwd %ymm0, %zmm0 {%k1} {z}
-;SKX-NEXT: retq
define <16 x i32> @zext_16x16_to_16x32mask(<16 x i16> %a , <16 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_16x16_to_16x32mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT: vpmovzxwd %ymm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_16x16_to_16x32mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
+; SKX-NEXT: vpmovb2m %xmm1, %k1
+; SKX-NEXT: vpmovzxwd %ymm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%x = zext <16 x i16> %a to <16 x i32>
%ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
ret <16 x i32> %ret
}
-;SKX-LABEL: zext_16x16_to_16x32:
-;KNL-LABEL: zext_16x16_to_16x32:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovzxwd %ymm0, %zmm0
-;KNL: vpmovzxwd %ymm0, %zmm0
-;SKX-NEXT: retq
define <16 x i32> @zext_16x16_to_16x32(<16 x i16> %a ) nounwind readnone {
+; ALL-LABEL: zext_16x16_to_16x32:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxwd %ymm0, %zmm0
+; ALL-NEXT: retq
%x = zext <16 x i16> %a to <16 x i32>
ret <16 x i32> %x
}
-;SKX-LABEL: zext_2x16mem_to_2x64:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovq2m %xmm0, %k1
-;SKX-NEXT: vpmovzxwq (%rdi), %xmm0 {%k1} {z}
-;SKX-NEXT: retq
define <2 x i64> @zext_2x16mem_to_2x64(<2 x i16> *%i , <2 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_2x16mem_to_2x64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsllq $63, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; KNL-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_2x16mem_to_2x64:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
+; SKX-NEXT: vpmovq2m %xmm0, %k1
+; SKX-NEXT: vpmovzxwq (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <2 x i16>,<2 x i16> *%i,align 1
%x = zext <2 x i16> %a to <2 x i64>
%ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer
ret <2 x i64> %ret
}
-;SKX-LABEL: sext_2x16mem_to_2x64mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovq2m %xmm0, %k1
-;SKX-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z}
-;SKX-NEXT: retq
define <2 x i64> @sext_2x16mem_to_2x64mask(<2 x i16> *%i , <2 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_2x16mem_to_2x64mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsllq $63, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; KNL-NEXT: vpmovsxwq (%rdi), %xmm1
+; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_2x16mem_to_2x64mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
+; SKX-NEXT: vpmovq2m %xmm0, %k1
+; SKX-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <2 x i16>,<2 x i16> *%i,align 1
%x = sext <2 x i16> %a to <2 x i64>
%ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer
ret <2 x i64> %ret
}
-;SKX-LABEL: sext_2x16mem_to_2x64:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovsxwq (%rdi), %xmm0
-;SKX-NEXT: retq
define <2 x i64> @sext_2x16mem_to_2x64(<2 x i16> *%i) nounwind readnone {
+; ALL-LABEL: sext_2x16mem_to_2x64:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxwq (%rdi), %xmm0
+; ALL-NEXT: retq
%a = load <2 x i16>,<2 x i16> *%i,align 1
%x = sext <2 x i16> %a to <2 x i64>
ret <2 x i64> %x
}
-;SKX-LABEL: zext_4x16mem_to_4x64:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovd2m %xmm0, %k1
-;SKX-NEXT: vpmovzxwq (%rdi), %ymm0 {%k1} {z}
-;SKX-NEXT: retq
define <4 x i64> @zext_4x16mem_to_4x64(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_4x16mem_to_4x64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $31, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxdq %xmm0, %ymm0
+; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_4x16mem_to_4x64:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0
+; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vpmovzxwq (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
%x = zext <4 x i16> %a to <4 x i64>
%ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer
ret <4 x i64> %ret
}
-;SKX-LABEL: sext_4x16mem_to_4x64mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovd2m %xmm0, %k1
-;SKX-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z}
-;SKX-NEXT: retq
define <4 x i64> @sext_4x16mem_to_4x64mask(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_4x16mem_to_4x64mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $31, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxdq %xmm0, %ymm0
+; KNL-NEXT: vpmovsxwq (%rdi), %ymm1
+; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_4x16mem_to_4x64mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0
+; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
%x = sext <4 x i16> %a to <4 x i64>
%ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer
ret <4 x i64> %ret
}
-;SKX-LABEL: sext_4x16mem_to_4x64:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovsxwq (%rdi), %ymm0
-;SKX-NEXT: retq
define <4 x i64> @sext_4x16mem_to_4x64(<4 x i16> *%i) nounwind readnone {
+; ALL-LABEL: sext_4x16mem_to_4x64:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxwq (%rdi), %ymm0
+; ALL-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
%x = sext <4 x i16> %a to <4 x i64>
ret <4 x i64> %x
}
-;SKX-LABEL: zext_8x16mem_to_8x64:
-;KNL-LABEL: zext_8x16mem_to_8x64:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovw2m %xmm0, %k1
-;SKX-NEXT: vpmovzxwq (%rdi), %zmm0 {%k1} {z}
-;KNL: vpmovzxwq (%rdi), %zmm0 {%k1} {z}
-;SKX-NEXT: retq
define <8 x i64> @zext_8x16mem_to_8x64(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_8x16mem_to_8x64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL-NEXT: vpmovzxwq (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_8x16mem_to_8x64:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vpmovzxwq (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <8 x i16>,<8 x i16> *%i,align 1
%x = zext <8 x i16> %a to <8 x i64>
%ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
ret <8 x i64> %ret
}
-;SKX-LABEL: sext_8x16mem_to_8x64mask:
-;KNL-LABEL: sext_8x16mem_to_8x64mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovw2m %xmm0, %k1
-;SKX-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z}
-;KNL: vpmovsxwq (%rdi), %zmm0 {%k1} {z}
-;SKX-NEXT: retq
define <8 x i64> @sext_8x16mem_to_8x64mask(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_8x16mem_to_8x64mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_8x16mem_to_8x64mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <8 x i16>,<8 x i16> *%i,align 1
%x = sext <8 x i16> %a to <8 x i64>
%ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
ret <8 x i64> %ret
}
-;SKX-LABEL: sext_8x16mem_to_8x64:
-;KNL-LABEL: sext_8x16mem_to_8x64:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovsxwq (%rdi), %zmm0
-;KNL: vpmovsxwq (%rdi), %zmm0
-;SKX-NEXT: retq
define <8 x i64> @sext_8x16mem_to_8x64(<8 x i16> *%i) nounwind readnone {
+; ALL-LABEL: sext_8x16mem_to_8x64:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxwq (%rdi), %zmm0
+; ALL-NEXT: retq
%a = load <8 x i16>,<8 x i16> *%i,align 1
%x = sext <8 x i16> %a to <8 x i64>
ret <8 x i64> %x
}
-;SKX-LABEL: zext_8x16_to_8x64mask:
-;KNL-LABEL: zext_8x16_to_8x64mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovw2m %xmm1, %k1
-;SKX-NEXT: vpmovzxwq %xmm0, %zmm0 {%k1} {z}
-;KNL: vpmovzxwq %xmm0, %zmm0 {%k1} {z}
-;SKX-NEXT: retq
define <8 x i64> @zext_8x16_to_8x64mask(<8 x i16> %a , <8 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_8x16_to_8x64mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
+; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
+; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vpmovzxwq %xmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_8x16_to_8x64mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
+; SKX-NEXT: vpmovw2m %xmm1, %k1
+; SKX-NEXT: vpmovzxwq %xmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%x = zext <8 x i16> %a to <8 x i64>
%ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
ret <8 x i64> %ret
}
-;SKX-LABEL: zext_8x16_to_8x64:
-;KNL-LABEL: zext_8x16_to_8x64:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovzxwq %xmm0, %zmm0
-;KNL: vpmovzxwq %xmm0, %zmm0
-;SKX-NEXT: retq
-; KNL: ret
define <8 x i64> @zext_8x16_to_8x64(<8 x i16> %a) nounwind readnone {
+; ALL-LABEL: zext_8x16_to_8x64:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxwq %xmm0, %zmm0
+; ALL-NEXT: retq
%ret = zext <8 x i16> %a to <8 x i64>
ret <8 x i64> %ret
}
-;SKX-LABEL: zext_2x32mem_to_2x64:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovq2m %xmm0, %k1
-;SKX-NEXT: vpmovzxdq (%rdi), %xmm0 {%k1} {z}
-;SKX-NEXT: retq
define <2 x i64> @zext_2x32mem_to_2x64(<2 x i32> *%i , <2 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_2x32mem_to_2x64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsllq $63, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; KNL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_2x32mem_to_2x64:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
+; SKX-NEXT: vpmovq2m %xmm0, %k1
+; SKX-NEXT: vpmovzxdq (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <2 x i32>,<2 x i32> *%i,align 1
%x = zext <2 x i32> %a to <2 x i64>
%ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer
ret <2 x i64> %ret
}
-;SKX-LABEL: sext_2x32mem_to_2x64mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovq2m %xmm0, %k1
-;SKX-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z}
-;SKX-NEXT: retq
define <2 x i64> @sext_2x32mem_to_2x64mask(<2 x i32> *%i , <2 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_2x32mem_to_2x64mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsllq $63, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; KNL-NEXT: vpmovsxdq (%rdi), %xmm1
+; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_2x32mem_to_2x64mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
+; SKX-NEXT: vpmovq2m %xmm0, %k1
+; SKX-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <2 x i32>,<2 x i32> *%i,align 1
%x = sext <2 x i32> %a to <2 x i64>
%ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer
ret <2 x i64> %ret
}
-;SKX-LABEL: sext_2x32mem_to_2x64:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovsxdq (%rdi), %xmm0
-;SKX-NEXT: retq
define <2 x i64> @sext_2x32mem_to_2x64(<2 x i32> *%i) nounwind readnone {
+; ALL-LABEL: sext_2x32mem_to_2x64:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxdq (%rdi), %xmm0
+; ALL-NEXT: retq
%a = load <2 x i32>,<2 x i32> *%i,align 1
%x = sext <2 x i32> %a to <2 x i64>
ret <2 x i64> %x
}
-;SKX-LABEL: zext_4x32mem_to_4x64:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovd2m %xmm0, %k1
-;SKX-NEXT: vpmovzxdq (%rdi), %ymm0 {%k1} {z}
-;SKX-NEXT: retq
define <4 x i64> @zext_4x32mem_to_4x64(<4 x i32> *%i , <4 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_4x32mem_to_4x64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $31, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxdq %xmm0, %ymm0
+; KNL-NEXT: vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_4x32mem_to_4x64:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0
+; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vpmovzxdq (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <4 x i32>,<4 x i32> *%i,align 1
%x = zext <4 x i32> %a to <4 x i64>
%ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer
ret <4 x i64> %ret
}
-;SKX-LABEL: sext_4x32mem_to_4x64mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovd2m %xmm0, %k1
-;SKX-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z}
-;SKX-NEXT: retq
define <4 x i64> @sext_4x32mem_to_4x64mask(<4 x i32> *%i , <4 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_4x32mem_to_4x64mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $31, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxdq %xmm0, %ymm0
+; KNL-NEXT: vpmovsxdq (%rdi), %ymm1
+; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_4x32mem_to_4x64mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0
+; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <4 x i32>,<4 x i32> *%i,align 1
%x = sext <4 x i32> %a to <4 x i64>
%ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer
ret <4 x i64> %ret
}
-;SKX-LABEL: sext_4x32mem_to_4x64:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovsxdq (%rdi), %ymm0
-;SKX-NEXT: retq
define <4 x i64> @sext_4x32mem_to_4x64(<4 x i32> *%i) nounwind readnone {
+; ALL-LABEL: sext_4x32mem_to_4x64:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxdq (%rdi), %ymm0
+; ALL-NEXT: retq
%a = load <4 x i32>,<4 x i32> *%i,align 1
%x = sext <4 x i32> %a to <4 x i64>
ret <4 x i64> %x
}
-;SKX-LABEL: sext_4x32_to_4x64:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovsxdq %xmm0, %ymm0
-;SKX-NEXT: retq
define <4 x i64> @sext_4x32_to_4x64(<4 x i32> %a) nounwind readnone {
+; ALL-LABEL: sext_4x32_to_4x64:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxdq %xmm0, %ymm0
+; ALL-NEXT: retq
%x = sext <4 x i32> %a to <4 x i64>
ret <4 x i64> %x
}
-;SKX-LABEL: zext_4x32_to_4x64mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovd2m %xmm1, %k1
-;SKX-NEXT: vpmovzxdq %xmm0, %ymm0 {%k1} {z}
-;SKX-NEXT: retq
define <4 x i64> @zext_4x32_to_4x64mask(<4 x i32> %a , <4 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_4x32_to_4x64mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL-NEXT: vpsrad $31, %xmm1, %xmm1
+; KNL-NEXT: vpmovsxdq %xmm1, %ymm1
+; KNL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; KNL-NEXT: vpand %ymm0, %ymm1, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_4x32_to_4x64mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm1, %xmm1
+; SKX-NEXT: vpmovd2m %xmm1, %k1
+; SKX-NEXT: vpmovzxdq %xmm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
%x = zext <4 x i32> %a to <4 x i64>
%ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer
ret <4 x i64> %ret
}
-;SKX-LABEL: zext_8x32mem_to_8x64:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovw2m %xmm0, %k1
-;SKX-NEXT: vpmovzxdq (%rdi), %zmm0 {%k1} {z}
-;SKX-NEXT: retq
define <8 x i64> @zext_8x32mem_to_8x64(<8 x i32> *%i , <8 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_8x32mem_to_8x64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL-NEXT: vpmovzxdq (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_8x32mem_to_8x64:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vpmovzxdq (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <8 x i32>,<8 x i32> *%i,align 1
%x = zext <8 x i32> %a to <8 x i64>
%ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
ret <8 x i64> %ret
}
-;SKX-LABEL: sext_8x32mem_to_8x64mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovw2m %xmm0, %k1
-;SKX-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z}
-;SKX-NEXT: retq
define <8 x i64> @sext_8x32mem_to_8x64mask(<8 x i32> *%i , <8 x i1> %mask) nounwind readnone {
+; KNL-LABEL: sext_8x32mem_to_8x64mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_8x32mem_to_8x64mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = load <8 x i32>,<8 x i32> *%i,align 1
%x = sext <8 x i32> %a to <8 x i64>
%ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
ret <8 x i64> %ret
}
-;SKX-LABEL: sext_8x32mem_to_8x64:
-;KNL-LABEL: sext_8x32mem_to_8x64:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovsxdq (%rdi), %zmm0
-;KNL: vpmovsxdq (%rdi), %zmm0
-;SKX-NEXT: retq
define <8 x i64> @sext_8x32mem_to_8x64(<8 x i32> *%i) nounwind readnone {
+; ALL-LABEL: sext_8x32mem_to_8x64:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxdq (%rdi), %zmm0
+; ALL-NEXT: retq
%a = load <8 x i32>,<8 x i32> *%i,align 1
%x = sext <8 x i32> %a to <8 x i64>
ret <8 x i64> %x
}
-;SKX-LABEL: sext_8x32_to_8x64:
-;KNL-LABEL: sext_8x32_to_8x64:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovsxdq %ymm0, %zmm0
-;KNL: vpmovsxdq %ymm0, %zmm0
-;SKX-NEXT: retq
define <8 x i64> @sext_8x32_to_8x64(<8 x i32> %a) nounwind readnone {
+; ALL-LABEL: sext_8x32_to_8x64:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxdq %ymm0, %zmm0
+; ALL-NEXT: retq
%x = sext <8 x i32> %a to <8 x i64>
ret <8 x i64> %x
}
-;SKX-LABEL: zext_8x32_to_8x64mask:
-;KNL-LABEL: zext_8x32_to_8x64mask:
-;SKX: ## BB#0:
-;SKX-NEXT: vpmovw2m %xmm1, %k1
-;SKX-NEXT: vpmovzxdq %ymm0, %zmm0 {%k1} {z}
-;KNL: vpmovzxdq %ymm0, %zmm0 {%k1} {z}
-;SKX-NEXT: retq
define <8 x i64> @zext_8x32_to_8x64mask(<8 x i32> %a , <8 x i1> %mask) nounwind readnone {
+; KNL-LABEL: zext_8x32_to_8x64mask:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
+; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
+; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vpmovzxdq %ymm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_8x32_to_8x64mask:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
+; SKX-NEXT: vpmovw2m %xmm1, %k1
+; SKX-NEXT: vpmovzxdq %ymm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%x = zext <8 x i32> %a to <8 x i64>
%ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
ret <8 x i64> %ret
}
-;KNL-LABEL: fptrunc_test
-;KNL: vcvtpd2ps {{.*}}%zmm
-;KNL: ret
define <8 x float> @fptrunc_test(<8 x double> %a) nounwind readnone {
+; ALL-LABEL: fptrunc_test:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvtpd2ps %zmm0, %ymm0
+; ALL-NEXT: retq
%b = fptrunc <8 x double> %a to <8 x float>
ret <8 x float> %b
}
-;KNL-LABEL: fpext_test
-;KNL: vcvtps2pd {{.*}}%zmm
-;KNL: ret
define <8 x double> @fpext_test(<8 x float> %a) nounwind readnone {
+; ALL-LABEL: fpext_test:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvtps2pd %ymm0, %zmm0
+; ALL-NEXT: retq
%b = fpext <8 x float> %a to <8 x double>
ret <8 x double> %b
}
-; KNL-LABEL: zext_16i1_to_16xi32
-; KNL: vpbroadcastd LCP{{.*}}(%rip), %zmm0 {%k1} {z}
-; KNL: ret
define <16 x i32> @zext_16i1_to_16xi32(i16 %b) {
+; ALL-LABEL: zext_16i1_to_16xi32:
+; ALL: ## BB#0:
+; ALL-NEXT: kmovw %edi, %k1
+; ALL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; ALL-NEXT: retq
%a = bitcast i16 %b to <16 x i1>
%c = zext <16 x i1> %a to <16 x i32>
ret <16 x i32> %c
}
-; KNL-LABEL: zext_8i1_to_8xi64
-; KNL: vpbroadcastq LCP{{.*}}(%rip), %zmm0 {%k1} {z}
-; KNL: ret
define <8 x i64> @zext_8i1_to_8xi64(i8 %b) {
+; KNL-LABEL: zext_8i1_to_8xi64:
+; KNL: ## BB#0:
+; KNL-NEXT: movzbl %dil, %eax
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_8i1_to_8xi64:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovb %edi, %k1
+; SKX-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = bitcast i8 %b to <8 x i1>
%c = zext <8 x i1> %a to <8 x i64>
ret <8 x i64> %c
}
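; KNL has no AVX512DQ, so an i8 mask must be zero-extended through a GPR
; (movzbl + kmovw); SKX loads it straight into a mask register with kmovb.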
-; KNL-LABEL: trunc_16i8_to_16i1
-; KNL: vpmovsxbd
-; KNL: vpandd
-; KNL: vptestmd
-; KNL: ret
-; SKX-LABEL: trunc_16i8_to_16i1
-; SKX: vpmovb2m %xmm
define i16 @trunc_16i8_to_16i1(<16 x i8> %a) {
+; KNL-LABEL: trunc_16i8_to_16i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: trunc_16i8_to_16i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: retq
%mask_b = trunc <16 x i8>%a to <16 x i1>
%mask = bitcast <16 x i1> %mask_b to i16
ret i16 %mask
}
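; Truncation to <N x i1> is the reverse idiom: bit 0 of each lane is shifted
; into the sign bit, then SKX reads the sign bits directly into a k-register
; (vpmovb2m here), while KNL sign-extends to 512 bits and tests each lane
; for being non-zero with vptestmd.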
-; KNL-LABEL: trunc_16i32_to_16i1
-; KNL: vpandd
-; KNL: vptestmd
-; KNL: ret
-; SKX-LABEL: trunc_16i32_to_16i1
-; SKX: vpmovd2m %zmm
define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {
+; KNL-LABEL: trunc_16i32_to_16i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: trunc_16i32_to_16i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX-NEXT: vpmovd2m %zmm0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: retq
%mask_b = trunc <16 x i32>%a to <16 x i1>
%mask = bitcast <16 x i1> %mask_b to i16
ret i16 %mask
}
-; SKX-LABEL: trunc_4i32_to_4i1
-; SKX: vpmovd2m %xmm
-; SKX: kandw
-; SKX: vpmovm2d
define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) {
+; KNL-LABEL: trunc_4i32_to_4i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vpslld $31, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: trunc_4i32_to_4i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0
+; SKX-NEXT: vpmovd2m %xmm0, %k0
+; SKX-NEXT: vpslld $31, %xmm1, %xmm0
+; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: kandw %k1, %k0, %k0
+; SKX-NEXT: vpmovm2d %k0, %xmm0
+; SKX-NEXT: retq
%mask_a = trunc <4 x i32>%a to <4 x i1>
%mask_b = trunc <4 x i32>%b to <4 x i1>
%a_and_b = and <4 x i1>%mask_a, %mask_b
  %res = sext <4 x i1> %a_and_b to <4 x i32>
  ret <4 x i32>%res
}
-; KNL-LABEL: trunc_8i16_to_8i1
-; KNL: vpmovsxwq
-; KNL: vpandq LCP{{.*}}(%rip){1to8}
-; KNL: vptestmq
-; KNL: ret
-; SKX-LABEL: trunc_8i16_to_8i1
-; SKX: vpmovw2m %xmm
define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
+; KNL-LABEL: trunc_8i16_to_8i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: trunc_8i16_to_8i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k0
+; SKX-NEXT: kmovb %k0, %eax
+; SKX-NEXT: retq
%mask_b = trunc <8 x i16>%a to <8 x i1>
%mask = bitcast <8 x i1> %mask_b to i8
ret i8 %mask
}
-; KNL-LABEL: sext_8i1_8i32
-; KNL: vpbroadcastq LCP{{.*}}(%rip), %zmm0 {%k1} {z}
-; SKX: vpmovm2d
-; KNL: ret
define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
+; KNL-LABEL: sext_8i1_8i32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
+; KNL-NEXT: knotw %k0, %k1
+; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovqd %zmm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_8i1_8i32:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k0
+; SKX-NEXT: knotb %k0, %k0
+; SKX-NEXT: vpmovm2d %k0, %ymm0
+; SKX-NEXT: retq
%x = icmp slt <8 x i32> %a1, %a2
%x1 = xor <8 x i1>%x, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
%y = sext <8 x i1> %x1 to <8 x i32>
  ret <8 x i32> %y
}
-; KNL-LABEL: trunc_i32_to_i1
-; KNL: movw $-4, %ax
-; KNL: kmovw %eax, %k1
-; KNL: korw
define i16 @trunc_i32_to_i1(i32 %a) {
+; ALL-LABEL: trunc_i32_to_i1:
+; ALL: ## BB#0:
+; ALL-NEXT: andl $1, %edi
+; ALL-NEXT: kmovw %edi, %k0
+; ALL-NEXT: movw $-4, %ax
+; ALL-NEXT: kmovw %eax, %k1
+; ALL-NEXT: korw %k0, %k1, %k0
+; ALL-NEXT: kmovw %k0, %eax
+; ALL-NEXT: retq
%a_i = trunc i32 %a to i1
%maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %a_i, i32 0
%res = bitcast <16 x i1> %maskv to i16
ret i16 %res
}
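; Inserting a scalar i1 into a constant <16 x i1> never leaves the mask
; registers: the scalar is reduced to one bit (andl $1) and moved into %k0,
; the fifteen constant lanes form the immediate 0xFFFC (movw $-4), and korw
; merges the two.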
-; KNL-LABEL: sext_8i1_8i16
-; SKX: vpmovm2w
-; KNL: ret
define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind {
+; KNL-LABEL: sext_8i1_8i16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0
+; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_8i1_8i16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k0
+; SKX-NEXT: vpmovm2w %k0, %xmm0
+; SKX-NEXT: retq
%x = icmp slt <8 x i32> %a1, %a2
%y = sext <8 x i1> %x to <8 x i16>
ret <8 x i16> %y
}
-; KNL-LABEL: sext_16i1_16i32
-; SKX: vpmovm2d
-; KNL: ret
define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind {
+; KNL-LABEL: sext_16i1_16i32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
+; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_16i1_16i32:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
+; SKX-NEXT: vpmovm2d %k0, %zmm0
+; SKX-NEXT: retq
%x = icmp slt <16 x i32> %a1, %a2
%y = sext <16 x i1> %x to <16 x i32>
ret <16 x i32> %y
}
-; KNL-LABEL: sext_8i1_8i64
-; SKX: vpmovm2q
-; KNL: ret
define <8 x i64> @sext_8i1_8i64(<8 x i32> %a1, <8 x i32> %a2) nounwind {
+; KNL-LABEL: sext_8i1_8i64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0
+; KNL-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sext_8i1_8i64:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k0
+; SKX-NEXT: vpmovm2q %k0, %zmm0
+; SKX-NEXT: retq
%x = icmp slt <8 x i32> %a1, %a2
%y = sext <8 x i1> %x to <8 x i64>
ret <8 x i64> %y
}
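; sext from <N x i1> goes the other way around: SKX expands a k-register
; into all-ones/all-zeros lanes with vpmovm2w/vpmovm2d/vpmovm2q, while KNL
; either reuses the wide ymm compare result directly or performs a masked
; vpbroadcast of an all-ones constant.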
-; KNL-LABEL: @extload_v8i64
-; KNL: vpmovsxbq
define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) {
+; ALL-LABEL: extload_v8i64:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxbq (%rdi), %zmm0
+; ALL-NEXT: vmovdqa64 %zmm0, (%rsi)
+; ALL-NEXT: retq
%sign_load = load <8 x i8>, <8 x i8>* %a
%c = sext <8 x i8> %sign_load to <8 x i64>
store <8 x i64> %c, <8 x i64>* %res
ret void
}
-;SKX-LABEL: test21:
-;SKX: vmovdqu16 %zmm0, %zmm3 {%k1}
-;SKX-NEXT: kshiftrq $32, %k1, %k1
-;SKX-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1}
define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
+; KNL-LABEL: test21:
+; KNL: ## BB#0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: pushq %r15
+; KNL-NEXT: pushq %r14
+; KNL-NEXT: pushq %r13
+; KNL-NEXT: pushq %r12
+; KNL-NEXT: pushq %rbx
+; KNL-NEXT: vpmovsxbd %xmm7, %zmm7
+; KNL-NEXT: vpslld $31, %zmm7, %zmm7
+; KNL-NEXT: vpmovsxbd %xmm6, %zmm6
+; KNL-NEXT: vpslld $31, %zmm6, %zmm6
+; KNL-NEXT: vpmovsxbd %xmm5, %zmm5
+; KNL-NEXT: vpslld $31, %zmm5, %zmm5
+; KNL-NEXT: vpmovsxbd %xmm4, %zmm4
+; KNL-NEXT: vpslld $31, %zmm4, %zmm4
+; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %edi
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %esi
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r13d
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r8d
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r10d
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r11d
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ebx
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ebp
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r14d
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r15d
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r12d
+; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vmovd %eax, %xmm4
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $14, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $1, %edx, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; KNL-NEXT: kshiftlw $15, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $2, %ecx, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kshiftlw $13, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $3, %edi, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %edi
+; KNL-NEXT: kshiftlw $12, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $4, %esi, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: kshiftlw $11, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $5, %r13d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r13d
+; KNL-NEXT: kshiftlw $10, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $6, %r8d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r8d
+; KNL-NEXT: kshiftlw $9, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $7, %r10d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: kshiftlw $8, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $8, %r11d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r11d
+; KNL-NEXT: kshiftlw $7, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $9, %ebx, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %ebx
+; KNL-NEXT: kshiftlw $6, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $10, %ebp, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %ebp
+; KNL-NEXT: kshiftlw $5, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $11, %r14d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r14d
+; KNL-NEXT: kshiftlw $4, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $12, %r15d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r15d
+; KNL-NEXT: kshiftlw $3, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $13, %r9d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; KNL-NEXT: kshiftlw $2, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $14, %r12d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r12d
+; KNL-NEXT: kshiftlw $1, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r9d
+; KNL-NEXT: vptestmd %zmm6, %zmm6, %k0
+; KNL-NEXT: kshiftlw $0, %k1, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vmovd %ecx, %xmm5
+; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vpinsrb $2, %edi, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vpinsrb $3, %esi, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %edi
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vpinsrb $4, %r13d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vpinsrb $5, %r8d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %r8d
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vpinsrb $6, %r10d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %r13d
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vpinsrb $7, %r11d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %esi
+; KNL-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vpinsrb $8, %ebx, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %ebx
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vpinsrb $9, %ebp, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %ebp
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vpinsrb $10, %r14d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %r10d
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vpinsrb $11, %r15d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %r11d
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; KNL-NEXT: kmovw %k1, %esi
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vpinsrb $13, %r12d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %r14d
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vpinsrb $14, %r9d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vpinsrb $15, %edx, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k1, %r15d
+; KNL-NEXT: vptestmd %zmm7, %zmm7, %k1
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vmovd %eax, %xmm6
+; KNL-NEXT: kmovw %k0, %r12d
+; KNL-NEXT: kshiftlw $14, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $15, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $2, %edi, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: kshiftlw $13, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $3, %ecx, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kshiftlw $12, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $4, %r8d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %r8d
+; KNL-NEXT: kshiftlw $11, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $5, %r13d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %r13d
+; KNL-NEXT: kshiftlw $10, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; KNL-NEXT: kmovw %k0, %edi
+; KNL-NEXT: kshiftlw $9, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $7, %ebx, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %ebx
+; KNL-NEXT: kshiftlw $8, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $8, %ebp, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %ebp
+; KNL-NEXT: kshiftlw $7, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $9, %r10d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: kshiftlw $6, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $10, %r11d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %r11d
+; KNL-NEXT: kshiftlw $5, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $11, %esi, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: kshiftlw $4, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $12, %r14d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %r14d
+; KNL-NEXT: kshiftlw $3, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $13, %r9d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %r9d
+; KNL-NEXT: kshiftlw $2, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $14, %r15d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %r15d
+; KNL-NEXT: kshiftlw $1, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $15, %r12d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k0, %r12d
+; KNL-NEXT: kshiftlw $0, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vmovd %edx, %xmm7
+; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: vpinsrb $1, %eax, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $2, %ecx, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $3, %r8d, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $4, %r13d, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $5, %edi, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $6, %ebx, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $7, %ebp, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $8, %r10d, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $9, %r11d, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $10, %esi, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $11, %r14d, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $12, %r9d, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $13, %r15d, %xmm7, %xmm7
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; KNL-NEXT: vpsllw $15, %ymm4, %ymm4
+; KNL-NEXT: vpsraw $15, %ymm4, %ymm4
+; KNL-NEXT: vpand %ymm0, %ymm4, %ymm0
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
+; KNL-NEXT: vpsllw $15, %ymm4, %ymm4
+; KNL-NEXT: vpsraw $15, %ymm4, %ymm4
+; KNL-NEXT: vpand %ymm1, %ymm4, %ymm1
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero
+; KNL-NEXT: vpsllw $15, %ymm4, %ymm4
+; KNL-NEXT: vpsraw $15, %ymm4, %ymm4
+; KNL-NEXT: vpand %ymm2, %ymm4, %ymm2
+; KNL-NEXT: vpinsrb $14, %r12d, %xmm7, %xmm4
+; KNL-NEXT: vpinsrb $15, %edx, %xmm4, %xmm4
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; KNL-NEXT: vpsllw $15, %ymm4, %ymm4
+; KNL-NEXT: vpsraw $15, %ymm4, %ymm4
+; KNL-NEXT: vpand %ymm3, %ymm4, %ymm3
+; KNL-NEXT: popq %rbx
+; KNL-NEXT: popq %r12
+; KNL-NEXT: popq %r13
+; KNL-NEXT: popq %r14
+; KNL-NEXT: popq %r15
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test21:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %zmm2, %zmm2
+; SKX-NEXT: vpmovb2m %zmm2, %k1
+; SKX-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; SKX-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; SKX-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
+; SKX-NEXT: kshiftrq $32, %k1, %k1
+; SKX-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1}
+; SKX-NEXT: vmovaps %zmm3, %zmm0
+; SKX-NEXT: vmovaps %zmm2, %zmm1
+; SKX-NEXT: retq
%ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer
ret <64 x i16> %ret
-}
\ No newline at end of file
+}
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f -fp-contract=fast | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -fp-contract=fast | FileCheck %s --check-prefix=SKX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=SKX
-; CHECK-LABEL: test_x86_fmadd_ps_z
-; CHECK: vfmadd213ps %zmm2, %zmm1, %zmm0
-; CHECK: ret
define <16 x float> @test_x86_fmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; ALL-LABEL: test_x86_fmadd_ps_z:
+; ALL: ## BB#0:
+; ALL-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm0
+; ALL-NEXT: retq
%x = fmul <16 x float> %a0, %a1
%res = fadd <16 x float> %x, %a2
ret <16 x float> %res
}
-; CHECK-LABEL: test_x86_fmsub_ps_z
-; CHECK: vfmsub213ps %zmm2, %zmm1, %zmm0
-; CHECK: ret
define <16 x float> @test_x86_fmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; ALL-LABEL: test_x86_fmsub_ps_z:
+; ALL: ## BB#0:
+; ALL-NEXT: vfmsub213ps %zmm2, %zmm1, %zmm0
+; ALL-NEXT: retq
%x = fmul <16 x float> %a0, %a1
%res = fsub <16 x float> %x, %a2
ret <16 x float> %res
}
-; CHECK-LABEL: test_x86_fnmadd_ps_z
-; CHECK: vfnmadd213ps %zmm2, %zmm1, %zmm0
-; CHECK: ret
define <16 x float> @test_x86_fnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; ALL-LABEL: test_x86_fnmadd_ps_z:
+; ALL: ## BB#0:
+; ALL-NEXT: vfnmadd213ps %zmm2, %zmm1, %zmm0
+; ALL-NEXT: retq
%x = fmul <16 x float> %a0, %a1
%res = fsub <16 x float> %a2, %x
ret <16 x float> %res
}
-; CHECK-LABEL: test_x86_fnmsub_ps_z
-; CHECK: vfnmsub213ps %zmm2, %zmm1, %zmm0
-; CHECK: ret
define <16 x float> @test_x86_fnmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; ALL-LABEL: test_x86_fnmsub_ps_z:
+; ALL: ## BB#0:
+; ALL-NEXT: vfnmsub213ps %zmm2, %zmm1, %zmm0
+; ALL-NEXT: retq
%x = fmul <16 x float> %a0, %a1
- %y = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00,
+ %y = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00,
float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00,
- float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00,
- float -0.000000e+00>, %x
+ float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00,
+ float -0.000000e+00>, %x
%res = fsub <16 x float> %y, %a2
ret <16 x float> %res
}
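; These tests rely on -fp-contract=fast: each fmul/fadd or fmul/fsub pair
; is contracted into a single FMA. The negated forms are matched when the
; subtraction takes the product as its second operand (fnmadd) or when the
; product is negated by subtracting it from -0.0 before the final fsub
; (fnmsub).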
-; CHECK-LABEL: test_x86_fmadd_pd_z
-; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0
-; CHECK: ret
define <8 x double> @test_x86_fmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+; ALL-LABEL: test_x86_fmadd_pd_z:
+; ALL: ## BB#0:
+; ALL-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm0
+; ALL-NEXT: retq
%x = fmul <8 x double> %a0, %a1
%res = fadd <8 x double> %x, %a2
ret <8 x double> %res
}
-; CHECK-LABEL: test_x86_fmsub_pd_z
-; CHECK: vfmsub213pd %zmm2, %zmm1, %zmm0
-; CHECK: ret
define <8 x double> @test_x86_fmsub_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+; ALL-LABEL: test_x86_fmsub_pd_z:
+; ALL: ## BB#0:
+; ALL-NEXT: vfmsub213pd %zmm2, %zmm1, %zmm0
+; ALL-NEXT: retq
%x = fmul <8 x double> %a0, %a1
%res = fsub <8 x double> %x, %a2
ret <8 x double> %res
}
define double @test_x86_fmsub_213(double %a0, double %a1, double %a2) {
-; CHECK-LABEL: test_x86_fmsub_213:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
+; ALL-LABEL: test_x86_fmsub_213:
+; ALL: ## BB#0:
+; ALL-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1
+; ALL-NEXT: vmovaps %zmm1, %zmm0
+; ALL-NEXT: retq
%x = fmul double %a0, %a1
%res = fsub double %x, %a2
ret double %res
}
define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) {
-; CHECK-LABEL: test_x86_fmsub_213_m:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vfmsub213sd (%rdi), %xmm0, %xmm1
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
+; KNL-LABEL: test_x86_fmsub_213_m:
+; KNL: ## BB#0:
+; KNL-NEXT: vfmsub213sd (%rdi), %xmm0, %xmm1
+; KNL-NEXT: vmovaps %zmm1, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_x86_fmsub_213_m:
+; SKX: ## BB#0:
+; SKX-NEXT: vfmsub213sd (%rdi), %xmm1, %xmm0
+; SKX-NEXT: retq
%a2 = load double , double *%a2_ptr
%x = fmul double %a0, %a1
%res = fsub double %x, %a2
  ret double %res
}
define double @test_x86_fmsub_231_m(double %a0, double %a1, double * %a2_ptr) {
-; CHECK-LABEL: test_x86_fmsub_231_m:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vfmsub231sd (%rdi), %xmm0, %xmm1
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
+; ALL-LABEL: test_x86_fmsub_231_m:
+; ALL: ## BB#0:
+; ALL-NEXT: vfmsub231sd (%rdi), %xmm0, %xmm1
+; ALL-NEXT: vmovaps %zmm1, %zmm0
+; ALL-NEXT: retq
%a2 = load double , double *%a2_ptr
%x = fmul double %a0, %a2
%res = fsub double %x, %a1
  ret double %res
}
define <16 x float> @test231_br(<16 x float> %a1, <16 x float> %a2) nounwind {
-; CHECK-LABEL: test231_br:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vfmadd231ps {{.*}}(%rip){1to16}, %zmm0, %zmm1
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
+; ALL-LABEL: test231_br:
+; ALL: ## BB#0:
+; ALL-NEXT: vfmadd231ps {{.*}}(%rip){1to16}, %zmm0, %zmm1
+; ALL-NEXT: vmovaps %zmm1, %zmm0
+; ALL-NEXT: retq
%b1 = fmul <16 x float> %a1, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
%b2 = fadd <16 x float> %b1, %a2
ret <16 x float> %b2
}
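; Here and in test213_br below, the splatted 0.1f vector is never
; materialized in a register; it is folded into the FMA as an embedded
; broadcast memory operand, {1to16}.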
define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind {
-; CHECK-LABEL: test213_br:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vfmadd213ps {{.*}}(%rip){1to16}, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; ALL-LABEL: test213_br:
+; ALL: ## BB#0:
+; ALL-NEXT: vfmadd213ps {{.*}}(%rip){1to16}, %zmm1, %zmm0
+; ALL-NEXT: retq
%b1 = fmul <16 x float> %a1, %a2
%b2 = fadd <16 x float> %b1, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
ret <16 x float> %b2
}
;mask (a*c+b , a)
define <16 x float> @test_x86_fmadd132_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
-; CHECK-LABEL: test_x86_fmadd132_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovsxbd %xmm2, %zmm2
-; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
-; CHECK-NEXT: vptestmd %zmm2, %zmm2, %k1
-; CHECK-NEXT: vfmadd132ps (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT: retq
+; KNL-LABEL: test_x86_fmadd132_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL-NEXT: vfmadd132ps (%rdi), %zmm1, %zmm0 {%k1}
+; KNL-NEXT: retq
;
; SKX-LABEL: test_x86_fmadd132_ps:
; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm2, %xmm2
; SKX-NEXT: vpmovb2m %xmm2, %k1
; SKX-NEXT: vfmadd132ps (%rdi), %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
%a2 = load <16 x float>, <16 x float>* %a2_ptrt
%x = fmul <16 x float> %a0, %a2
%y = fadd <16 x float> %x, %a1
%res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a0
ret <16 x float> %res
}
;mask (a*c+b , b)
define <16 x float> @test_x86_fmadd231_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
-; CHECK-LABEL: test_x86_fmadd231_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovsxbd %xmm2, %zmm2
-; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
-; CHECK-NEXT: vptestmd %zmm2, %zmm2, %k1
-; CHECK-NEXT: vfmadd231ps (%rdi), %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
+; KNL-LABEL: test_x86_fmadd231_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL-NEXT: vfmadd231ps (%rdi), %zmm0, %zmm1 {%k1}
+; KNL-NEXT: vmovaps %zmm1, %zmm0
+; KNL-NEXT: retq
;
; SKX-LABEL: test_x86_fmadd231_ps:
; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm2, %xmm2
; SKX-NEXT: vpmovb2m %xmm2, %k1
; SKX-NEXT: vfmadd231ps (%rdi), %zmm0, %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
%a2 = load <16 x float>, <16 x float>* %a2_ptrt
%x = fmul <16 x float> %a0, %a2
%y = fadd <16 x float> %x, %a1
%res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a1
ret <16 x float> %res
}
;mask (b*a+c , b)
define <16 x float> @test_x86_fmadd213_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
-; CHECK-LABEL: test_x86_fmadd213_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovsxbd %xmm2, %zmm2
-; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
-; CHECK-NEXT: vptestmd %zmm2, %zmm2, %k1
-; CHECK-NEXT: vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
+; KNL-LABEL: test_x86_fmadd213_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL-NEXT: vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1}
+; KNL-NEXT: vmovaps %zmm1, %zmm0
+; KNL-NEXT: retq
;
; SKX-LABEL: test_x86_fmadd213_ps:
; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm2, %xmm2
; SKX-NEXT: vpmovb2m %xmm2, %k1
; SKX-NEXT: vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
%a2 = load <16 x float>, <16 x float>* %a2_ptrt
%x = fmul <16 x float> %a1, %a0
%y = fadd <16 x float> %x, %a2
%res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a1
ret <16 x float> %res
}
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL --check-prefix=CHECK
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=SKX --check-prefix=CHECK
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
-; CHECK-LABEL: mask16
-; CHECK: kmovw
-; CHECK-NEXT: knotw
-; CHECK-NEXT: kmovw
define i16 @mask16(i16 %x) {
+; CHECK-LABEL: mask16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: knotw %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
%m0 = bitcast i16 %x to <16 x i1>
%m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <16 x i1> %m1 to i16
ret i16 %ret
}
-; CHECK-LABEL: mask8
-; KNL: kmovw
-; KNL-NEXT: knotw
-; KNL-NEXT: kmovw
-; SKX: kmovb
-; SKX-NEXT: knotb
-; SKX-NEXT: kmovb
-
define i8 @mask8(i8 %x) {
+; KNL-LABEL: mask8:
+; KNL: ## BB#0:
+; KNL-NEXT: movzbl %dil, %eax
+; KNL-NEXT: kmovw %eax, %k0
+; KNL-NEXT: knotw %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: mask8:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovb %edi, %k0
+; SKX-NEXT: knotb %k0, %k0
+; SKX-NEXT: kmovb %k0, %eax
+; SKX-NEXT: retq
%m0 = bitcast i8 %x to <8 x i1>
%m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <8 x i1> %m1 to i8
ret i8 %ret
}
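; i8 masks: SKX (AVX512DQ) has byte-granular mask instructions (kmovb/knotb),
; while KNL only has word-sized ones and must zero-extend and use kmovw/knotw.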
-; CHECK-LABEL: mask16_mem
-; CHECK: kmovw ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
-; CHECK-NEXT: knotw
-; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]])
-; CHECK: ret
-
define void @mask16_mem(i16* %ptr) {
+; CHECK-LABEL: mask16_mem:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw (%rdi), %k0
+; CHECK-NEXT: knotw %k0, %k0
+; CHECK-NEXT: kmovw %k0, (%rdi)
+; CHECK-NEXT: retq
%x = load i16, i16* %ptr, align 4
%m0 = bitcast i16 %x to <16 x i1>
%m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <16 x i1> %m1 to i16
store i16 %ret, i16* %ptr, align 4
ret void
}
-; CHECK-LABEL: mask8_mem
-; KNL: kmovw ([[ARG1]]), %k{{[0-7]}}
-; KNL-NEXT: knotw
-; KNL-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]])
-; SKX: kmovb ([[ARG1]]), %k{{[0-7]}}
-; SKX-NEXT: knotb
-; SKX-NEXT: kmovb %k{{[0-7]}}, ([[ARG1]])
-
define void @mask8_mem(i8* %ptr) {
+; KNL-LABEL: mask8_mem:
+; KNL: ## BB#0:
+; KNL-NEXT: kmovw (%rdi), %k0
+; KNL-NEXT: knotw %k0, %k0
+; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: mask8_mem:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovb (%rdi), %k0
+; SKX-NEXT: knotb %k0, %k0
+; SKX-NEXT: kmovb %k0, (%rdi)
+; SKX-NEXT: retq
%x = load i8, i8* %ptr, align 4
%m0 = bitcast i8 %x to <8 x i1>
%m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <8 x i1> %m1 to i8
store i8 %ret, i8* %ptr, align 4
ret void
}
-; CHECK-LABEL: mand16
-; CHECK: kandw
-; CHECK: kxorw
-; CHECK: korw
define i16 @mand16(i16 %x, i16 %y) {
+; CHECK-LABEL: mand16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: kandw %k1, %k0, %k2
+; CHECK-NEXT: kxorw %k1, %k0, %k0
+; CHECK-NEXT: korw %k0, %k2, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
%ma = bitcast i16 %x to <16 x i1>
%mb = bitcast i16 %y to <16 x i1>
%mc = and <16 x i1> %ma, %mb
%md = xor <16 x i1> %ma, %mb
%me = or <16 x i1> %mc, %md
%ret = bitcast <16 x i1> %me to i16
ret i16 %ret
}
-; CHECK-LABEL: shuf_test1
-; CHECK: kshiftrw $8
define i8 @shuf_test1(i16 %v) nounwind {
+; KNL-LABEL: shuf_test1:
+; KNL: ## BB#0:
+; KNL-NEXT: kmovw %edi, %k0
+; KNL-NEXT: kshiftrw $8, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: shuf_test1:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovw %edi, %k0
+; SKX-NEXT: kshiftrw $8, %k0, %k0
+; SKX-NEXT: kmovb %k0, %eax
+; SKX-NEXT: retq
%v1 = bitcast i16 %v to <16 x i1>
%mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%mask1 = bitcast <8 x i1> %mask to i8
ret i8 %mask1
}
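; Taking the high half of a 16-bit mask lowers to a single kshiftrw $8.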
-; CHECK-LABEL: zext_test1
-; CHECK: kshiftlw
-; CHECK: kshiftrw
-; CHECK: kmovw
-
define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: zext_test1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; CHECK-NEXT: kshiftlw $10, %k0, %k0
+; CHECK-NEXT: kshiftrw $15, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: retq
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
%res = zext i1 %cmp_res.i1 to i32
ret i32 %res
-}
-
-; CHECK-LABEL: zext_test2
-; CHECK: kshiftlw
-; CHECK: kshiftrw
-; CHECK: kmovw
-
-define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
+}
+
+define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
%res = zext i1 %cmp_res.i1 to i16
ret i16 %res
-}
-
-; CHECK-LABEL: zext_test3
-; CHECK: kshiftlw
-; CHECK: kshiftrw
-; CHECK: kmovw
-
-define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
+}
+
+define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
%res = zext i1 %cmp_res.i1 to i8
ret i8 %res
}
-; CHECK-LABEL: conv1
-; KNL: kmovw %k0, %eax
-; KNL: movb %al, (%rdi)
-; SKX: kmovb %k0, (%rdi)
define i8 @conv1(<8 x i1>* %R) {
+; KNL-LABEL: conv1:
+; KNL: ## BB#0: ## %entry
+; KNL-NEXT: kxnorw %k0, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: movb $-2, -{{[0-9]+}}(%rsp)
+; KNL-NEXT: movb $-2, %al
+; KNL-NEXT: retq
+;
+; SKX-LABEL: conv1:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: kxnorw %k0, %k0, %k0
+; SKX-NEXT: kmovb %k0, (%rdi)
+; SKX-NEXT: movb $-2, -{{[0-9]+}}(%rsp)
+; SKX-NEXT: movb $-2, %al
+; SKX-NEXT: retq
entry:
store <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1>* %R
%maskPtr = alloca <8 x i1>
store <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1>* %maskPtr
%mask = load <8 x i1>, <8 x i1>* %maskPtr
%mask_convert = bitcast <8 x i1> %mask to i8
ret i8 %mask_convert
}
-; SKX-LABEL: test4
-; SKX: vpcmpgt
-; SKX: knot
-; SKX: vpcmpgt
-; SKX: vpmovm2d
define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) {
+; KNL-LABEL: test4:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpmovqd %zmm0, %ymm0
+; KNL-NEXT: vpslld $31, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm1
+; KNL-NEXT: vpmovqd %zmm1, %ymm1
+; KNL-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL-NEXT: vpsrad $31, %xmm1, %xmm1
+; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test4:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpgtq %ymm3, %ymm2, %k0
+; SKX-NEXT: knotw %k0, %k1
+; SKX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
+; SKX-NEXT: vpmovm2d %k0, %xmm0
+; SKX-NEXT: retq
%x_gt_y = icmp sgt <4 x i64> %x, %y
%x1_gt_y1 = icmp sgt <4 x i64> %x1, %y1
%res = icmp sgt <4 x i1>%x_gt_y, %x1_gt_y1
%resse = sext <4 x i1>%res to <4 x i32>
ret <4 x i32> %resse
}
-; SKX-LABEL: test5
-; SKX: vpcmpgt
-; SKX: knot
-; SKX: vpcmpgt
-; SKX: vpmovm2q
define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) {
+; KNL-LABEL: test5:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; KNL-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
+; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test5:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpgtq %xmm0, %xmm1, %k0
+; SKX-NEXT: knotw %k0, %k1
+; SKX-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
+; SKX-NEXT: vpmovm2q %k0, %xmm0
+; SKX-NEXT: retq
%x_gt_y = icmp slt <2 x i64> %x, %y
%x1_gt_y1 = icmp sgt <2 x i64> %x1, %y1
%res = icmp slt <2 x i1>%x_gt_y, %x1_gt_y1
%resse = sext <2 x i1>%res to <2 x i64>
ret <2 x i64> %resse
-}
-
-; KNL-LABEL: test6
-; KNL: vpmovsxbd
-; KNL: vpandd
-; KNL: kmovw %eax, %k1
-; KNL vptestmd {{.*}}, %k0 {%k1}
-
-; SKX-LABEL: test6
-; SKX: vpmovb2m
-; SKX: kmovw %eax, %k1
-; SKX: kandw
-define void @test6(<16 x i1> %mask) {
+}
+
+define void @test6(<16 x i1> %mask) {
allocas:
%a= and <16 x i1> %mask, <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
%b = bitcast <16 x i1> %a to i16
%c = icmp eq i16 %b, 0
br i1 %c, label %true, label %false

true:
ret void

false:
ret void
}
-
-; KNL-LABEL: test7
-; KNL: vpmovsxwq
-; KNL: vpandq
-; KNL: vptestmq {{.*}}, %k0
-; KNL: korw
-
-; SKX-LABEL: test7
-; SKX: vpmovw2m
-; SKX: kmovb %eax, %k1
-; SKX: korb
-
define void @test7(<8 x i1> %mask) {
+; KNL-LABEL: test7:
+; KNL: ## BB#0: ## %allocas
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT: movb $85, %al
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: testb %al, %al
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test7:
+; SKX: ## BB#0: ## %allocas
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k0
+; SKX-NEXT: movb $85, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: korb %k1, %k0, %k0
+; SKX-NEXT: kmovb %k0, %eax
+; SKX-NEXT: testb %al, %al
+; SKX-NEXT: retq
allocas:
%a= or <8 x i1> %mask, <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
%b = bitcast <8 x i1> %a to i8
%c = icmp eq i8 %b, 0
br i1 %c, label %true, label %false

true:
ret void

false:
ret void
}
-
-; KNL-LABEL: test8
-; KNL: vpxord %zmm2, %zmm2, %zmm2
-; KNL: jg
-; KNL: vpcmpltud %zmm2, %zmm1, %k1
-; KNL: jmp
-; KNL: vpcmpgtd %zmm2, %zmm0, %k1
-
-; SKX-LABEL: test8
-; SKX: jg
-; SKX: vpcmpltud {{.*}}, %k0
-; SKX: vpmovm2b
-; SKX: vpcmpgtd {{.*}}, %k0
-; SKX: vpmovm2b
-
define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
+; KNL-LABEL: test8:
+; KNL: ## BB#0:
+; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; KNL-NEXT: cmpl %esi, %edi
+; KNL-NEXT: jg LBB14_1
+; KNL-NEXT: ## BB#2:
+; KNL-NEXT: vpcmpltud %zmm2, %zmm1, %k1
+; KNL-NEXT: jmp LBB14_3
+; KNL-NEXT: LBB14_1:
+; KNL-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
+; KNL-NEXT: LBB14_3:
+; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; SKX-NEXT: cmpl %esi, %edi
+; SKX-NEXT: jg LBB14_1
+; SKX-NEXT: ## BB#2:
+; SKX-NEXT: vpcmpltud %zmm2, %zmm1, %k0
+; SKX-NEXT: vpmovm2b %k0, %xmm0
+; SKX-NEXT: retq
+; SKX-NEXT: LBB14_1:
+; SKX-NEXT: vpcmpgtd %zmm2, %zmm0, %k0
+; SKX-NEXT: vpmovm2b %k0, %xmm0
+; SKX-NEXT: retq
%cond = icmp sgt i32 %a1, %b1
%cmp1 = icmp sgt <16 x i32> %a, zeroinitializer
%cmp2 = icmp ult <16 x i32> %b, zeroinitializer
%mix = select i1 %cond, <16 x i1> %cmp1, <16 x i1> %cmp2
%res = sext <16 x i1> %mix to <16 x i8>
ret <16 x i8> %res
}
-
-; KNL-LABEL: test9
-; KNL: jg
-; KNL: vpmovsxbd %xmm1, %zmm0
-; KNL: jmp
-; KNL: vpmovsxbd %xmm0, %zmm0
-
-; SKX-LABEL: test9
-; SKX: vpmovb2m %xmm1, %k0
-; SKX: vpmovm2b %k0, %xmm0
-; SKX: retq
-; SKX: vpmovb2m %xmm0, %k0
-; SKX: vpmovm2b %k0, %xmm0
-
define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) {
+; KNL-LABEL: test9:
+; KNL: ## BB#0:
+; KNL-NEXT: cmpl %esi, %edi
+; KNL-NEXT: jg LBB15_1
+; KNL-NEXT: ## BB#2:
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm0
+; KNL-NEXT: jmp LBB15_3
+; KNL-NEXT: LBB15_1:
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: LBB15_3:
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test9:
+; SKX: ## BB#0:
+; SKX-NEXT: cmpl %esi, %edi
+; SKX-NEXT: jg LBB15_1
+; SKX-NEXT: ## BB#2:
+; SKX-NEXT: vpsllw $7, %xmm1, %xmm0
+; SKX-NEXT: jmp LBB15_3
+; SKX-NEXT: LBB15_1:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: LBB15_3:
+; SKX-NEXT: vpmovb2m %xmm0, %k0
+; SKX-NEXT: vpmovm2b %k0, %xmm0
+; SKX-NEXT: retq
%mask = icmp sgt i32 %a1, %b1
%c = select i1 %mask, <16 x i1>%a, <16 x i1>%b
ret <16 x i1>%c
-}
-
-; KNL-LABEL: test10
-; KNL: jg
-; KNL: vpmovsxwq %xmm1, %zmm0
-; KNL: jmp
-; KNL: vpmovsxwq %xmm0, %zmm0
-
-; SKX-LABEL: test10
-; SKX: jg
-; SKX: vpmovw2m %xmm1, %k0
-; SKX: vpmovm2w %k0, %xmm0
-; SKX: retq
-; SKX: vpmovw2m %xmm0, %k0
-; SKX: vpmovm2w %k0, %xmm0
-define <8 x i1> @test10(<8 x i1>%a, <8 x i1>%b, i32 %a1, i32 %b1) {
+}
+
+define <8 x i1> @test10(<8 x i1>%a, <8 x i1>%b, i32 %a1, i32 %b1) {
%mask = icmp sgt i32 %a1, %b1
%c = select i1 %mask, <8 x i1>%a, <8 x i1>%b
ret <8 x i1>%c
}
-; SKX-LABEL: test11
-; SKX: jg
-; SKX: vpmovd2m %xmm1, %k0
-; SKX: vpmovm2d %k0, %xmm0
-; SKX: retq
-; SKX: vpmovd2m %xmm0, %k0
-; SKX: vpmovm2d %k0, %xmm0
define <4 x i1> @test11(<4 x i1>%a, <4 x i1>%b, i32 %a1, i32 %b1) {
+; KNL-LABEL: test11:
+; KNL: ## BB#0:
+; KNL-NEXT: cmpl %esi, %edi
+; KNL-NEXT: jg LBB17_2
+; KNL-NEXT: ## BB#1:
+; KNL-NEXT: vmovaps %zmm1, %zmm0
+; KNL-NEXT: LBB17_2:
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test11:
+; SKX: ## BB#0:
+; SKX-NEXT: cmpl %esi, %edi
+; SKX-NEXT: jg LBB17_1
+; SKX-NEXT: ## BB#2:
+; SKX-NEXT: vpslld $31, %xmm1, %xmm0
+; SKX-NEXT: jmp LBB17_3
+; SKX-NEXT: LBB17_1:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0
+; SKX-NEXT: LBB17_3:
+; SKX-NEXT: vpmovd2m %xmm0, %k0
+; SKX-NEXT: vpmovm2d %k0, %xmm0
+; SKX-NEXT: retq
%mask = icmp sgt i32 %a1, %b1
%c = select i1 %mask, <4 x i1>%a, <4 x i1>%b
ret <4 x i1>%c
}
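; Note the asymmetry above: KNL keeps the <4 x i1> values in vector registers
; and selects with a branch over a vmovaps, while SKX materializes the mask
; via vpslld + vpmovd2m.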
-; KNL-LABEL: test12
-; KNL: movl %edi, %eax
define i32 @test12(i32 %x, i32 %y) {
+; CHECK-LABEL: test12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
%a = bitcast i16 21845 to <16 x i1>
%b = extractelement <16 x i1> %a, i32 0
%c = select i1 %b, i32 %x, i32 %y
ret i32 %c
}
-; KNL-LABEL: test13
-; KNL: movl %esi, %eax
define i32 @test13(i32 %x, i32 %y) {
+; CHECK-LABEL: test13:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
%a = bitcast i16 21845 to <16 x i1>
%b = extractelement <16 x i1> %a, i32 3
%c = select i1 %b, i32 %x, i32 %y
ret i32 %c
-}
-
-; SKX-LABEL: test14
-; SKX: movb $11, %al
-; SKX: kmovb %eax, %k0
-; SKX: vpmovm2d %k0, %xmm0
-
-define <4 x i1> @test14() {
+}
+
+define <4 x i1> @test14() {
%a = bitcast i16 21845 to <16 x i1>
%b = extractelement <16 x i1> %a, i32 2
%c = insertelement <4 x i1> <i1 true, i1 false, i1 false, i1 true>, i1 %b, i32 1
ret <4 x i1> %c
}
-; KNL-LABEL: test15
-; KNL: cmovgw
define <16 x i1> @test15(i32 %x, i32 %y) {
+; KNL-LABEL: test15:
+; KNL: ## BB#0:
+; KNL-NEXT: cmpl %esi, %edi
+; KNL-NEXT: movw $21845, %ax ## imm = 0x5555
+; KNL-NEXT: movw $1, %cx
+; KNL-NEXT: cmovgw %ax, %cx
+; KNL-NEXT: kmovw %ecx, %k1
+; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test15:
+; SKX: ## BB#0:
+; SKX-NEXT: cmpl %esi, %edi
+; SKX-NEXT: movw $21845, %ax ## imm = 0x5555
+; SKX-NEXT: movw $1, %cx
+; SKX-NEXT: cmovgw %ax, %cx
+; SKX-NEXT: kmovw %ecx, %k0
+; SKX-NEXT: vpmovm2b %k0, %xmm0
+; SKX-NEXT: retq
%a = bitcast i16 21845 to <16 x i1>
%b = bitcast i16 1 to <16 x i1>
%mask = icmp sgt i32 %x, %y
%c = select i1 %mask, <16 x i1> %a, <16 x i1> %b
ret <16 x i1> %c
}
-; SKX-LABEL: test16
-; SKX: kxnorw %k0, %k0, %k1
-; SKX: kshiftrw $15, %k1, %k1
-; SKX: kshiftlq $5, %k1, %k1
-; SKX: korq %k1, %k0, %k0
-; SKX: vpmovm2b %k0, %zmm0
define <64 x i8> @test16(i64 %x) {
+; KNL-LABEL: test16:
+; KNL: ## BB#0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: Ltmp0:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: Ltmp1:
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: Ltmp2:
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: pushq %r15
+; KNL-NEXT: pushq %r14
+; KNL-NEXT: pushq %r13
+; KNL-NEXT: pushq %r12
+; KNL-NEXT: pushq %rbx
+; KNL-NEXT: andq $-32, %rsp
+; KNL-NEXT: subq $128, %rsp
+; KNL-NEXT: Ltmp3:
+; KNL-NEXT: .cfi_offset %rbx, -56
+; KNL-NEXT: Ltmp4:
+; KNL-NEXT: .cfi_offset %r12, -48
+; KNL-NEXT: Ltmp5:
+; KNL-NEXT: .cfi_offset %r13, -40
+; KNL-NEXT: Ltmp6:
+; KNL-NEXT: .cfi_offset %r14, -32
+; KNL-NEXT: Ltmp7:
+; KNL-NEXT: .cfi_offset %r15, -24
+; KNL-NEXT: movq %rdi, %rax
+; KNL-NEXT: shrq $32, %rax
+; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp)
+; KNL-NEXT: movl $271, %eax ## imm = 0x10F
+; KNL-NEXT: bextrl %eax, %edi, %eax
+; KNL-NEXT: movl %edi, %ecx
+; KNL-NEXT: andl $1, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm0
+; KNL-NEXT: movl $257, %ecx ## imm = 0x101
+; KNL-NEXT: bextrl %ecx, %edi, %ecx
+; KNL-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; KNL-NEXT: movl $258, %ecx ## imm = 0x102
+; KNL-NEXT: bextrl %ecx, %edi, %ecx
+; KNL-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; KNL-NEXT: movl $259, %ecx ## imm = 0x103
+; KNL-NEXT: bextrl %ecx, %edi, %ecx
+; KNL-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; KNL-NEXT: movl $260, %ecx ## imm = 0x104
+; KNL-NEXT: bextrl %ecx, %edi, %ecx
+; KNL-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; KNL-NEXT: movl $261, %ecx ## imm = 0x105
+; KNL-NEXT: bextrl %ecx, %edi, %ecx
+; KNL-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; KNL-NEXT: movl $262, %ecx ## imm = 0x106
+; KNL-NEXT: bextrl %ecx, %edi, %ecx
+; KNL-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; KNL-NEXT: movl $263, %ecx ## imm = 0x107
+; KNL-NEXT: bextrl %ecx, %edi, %ecx
+; KNL-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; KNL-NEXT: movl $264, %ecx ## imm = 0x108
+; KNL-NEXT: bextrl %ecx, %edi, %ecx
+; KNL-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; KNL-NEXT: movl $265, %ecx ## imm = 0x109
+; KNL-NEXT: bextrl %ecx, %edi, %ecx
+; KNL-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; KNL-NEXT: movl $266, %ecx ## imm = 0x10A
+; KNL-NEXT: bextrl %ecx, %edi, %ecx
+; KNL-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; KNL-NEXT: movl $267, %ecx ## imm = 0x10B
+; KNL-NEXT: bextrl %ecx, %edi, %ecx
+; KNL-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; KNL-NEXT: movl $268, %ecx ## imm = 0x10C
+; KNL-NEXT: bextrl %ecx, %edi, %ecx
+; KNL-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; KNL-NEXT: movl $269, %ecx ## imm = 0x10D
+; KNL-NEXT: bextrl %ecx, %edi, %ecx
+; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; KNL-NEXT: movl $270, %ecx ## imm = 0x10E
+; KNL-NEXT: bextrl %ecx, %edi, %ecx
+; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
+; KNL-NEXT: movl $1, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm0
+; KNL-NEXT: movl {{[0-9]+}}(%rsp), %r15d
+; KNL-NEXT: movq %r15, %rdx
+; KNL-NEXT: shrq $17, %rdx
+; KNL-NEXT: andb $1, %dl
+; KNL-NEXT: je LBB22_2
+; KNL-NEXT: ## BB#1:
+; KNL-NEXT: movb $-1, %dl
+; KNL-NEXT: LBB22_2:
+; KNL-NEXT: movq %r15, %r11
+; KNL-NEXT: shrq $16, %r11
+; KNL-NEXT: andb $1, %r11b
+; KNL-NEXT: je LBB22_4
+; KNL-NEXT: ## BB#3:
+; KNL-NEXT: movb $-1, %r11b
+; KNL-NEXT: LBB22_4:
+; KNL-NEXT: movq %r15, %r10
+; KNL-NEXT: shrq $18, %r10
+; KNL-NEXT: andb $1, %r10b
+; KNL-NEXT: je LBB22_6
+; KNL-NEXT: ## BB#5:
+; KNL-NEXT: movb $-1, %r10b
+; KNL-NEXT: LBB22_6:
+; KNL-NEXT: movq %r15, %r9
+; KNL-NEXT: shrq $19, %r9
+; KNL-NEXT: andb $1, %r9b
+; KNL-NEXT: je LBB22_8
+; KNL-NEXT: ## BB#7:
+; KNL-NEXT: movb $-1, %r9b
+; KNL-NEXT: LBB22_8:
+; KNL-NEXT: movq %r15, %rbx
+; KNL-NEXT: shrq $20, %rbx
+; KNL-NEXT: andb $1, %bl
+; KNL-NEXT: je LBB22_10
+; KNL-NEXT: ## BB#9:
+; KNL-NEXT: movb $-1, %bl
+; KNL-NEXT: LBB22_10:
+; KNL-NEXT: movq %r15, %r12
+; KNL-NEXT: shrq $21, %r12
+; KNL-NEXT: andb $1, %r12b
+; KNL-NEXT: je LBB22_12
+; KNL-NEXT: ## BB#11:
+; KNL-NEXT: movb $-1, %r12b
+; KNL-NEXT: LBB22_12:
+; KNL-NEXT: movq %r15, %r14
+; KNL-NEXT: shrq $22, %r14
+; KNL-NEXT: andb $1, %r14b
+; KNL-NEXT: je LBB22_14
+; KNL-NEXT: ## BB#13:
+; KNL-NEXT: movb $-1, %r14b
+; KNL-NEXT: LBB22_14:
+; KNL-NEXT: movq %r15, %r8
+; KNL-NEXT: shrq $23, %r8
+; KNL-NEXT: andb $1, %r8b
+; KNL-NEXT: je LBB22_16
+; KNL-NEXT: ## BB#15:
+; KNL-NEXT: movb $-1, %r8b
+; KNL-NEXT: LBB22_16:
+; KNL-NEXT: movq %r15, %r13
+; KNL-NEXT: shrq $24, %r13
+; KNL-NEXT: andb $1, %r13b
+; KNL-NEXT: je LBB22_18
+; KNL-NEXT: ## BB#17:
+; KNL-NEXT: movb $-1, %r13b
+; KNL-NEXT: LBB22_18:
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $25, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB22_20
+; KNL-NEXT: ## BB#19:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB22_20:
+; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $26, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB22_22
+; KNL-NEXT: ## BB#21:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB22_22:
+; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; KNL-NEXT: movl $272, %esi ## imm = 0x110
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $27, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB22_24
+; KNL-NEXT: ## BB#23:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB22_24:
+; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; KNL-NEXT: movl $273, %eax ## imm = 0x111
+; KNL-NEXT: bextrl %esi, %edi, %esi
+; KNL-NEXT: movq %r15, %rcx
+; KNL-NEXT: shrq $28, %rcx
+; KNL-NEXT: andb $1, %cl
+; KNL-NEXT: je LBB22_26
+; KNL-NEXT: ## BB#25:
+; KNL-NEXT: movb $-1, %cl
+; KNL-NEXT: LBB22_26:
+; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; KNL-NEXT: bextrl %eax, %edi, %eax
+; KNL-NEXT: vmovd %esi, %xmm2
+; KNL-NEXT: movl $274, %esi ## imm = 0x112
+; KNL-NEXT: movq %r15, %rcx
+; KNL-NEXT: shrq $29, %rcx
+; KNL-NEXT: andb $1, %cl
+; KNL-NEXT: je LBB22_28
+; KNL-NEXT: ## BB#27:
+; KNL-NEXT: movb $-1, %cl
+; KNL-NEXT: LBB22_28:
+; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; KNL-NEXT: bextrl %esi, %edi, %eax
+; KNL-NEXT: movzbl %r11b, %esi
+; KNL-NEXT: movq %r15, %rcx
+; KNL-NEXT: shrq $30, %rcx
+; KNL-NEXT: andb $1, %cl
+; KNL-NEXT: je LBB22_30
+; KNL-NEXT: ## BB#29:
+; KNL-NEXT: movb $-1, %cl
+; KNL-NEXT: LBB22_30:
+; KNL-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; KNL-NEXT: movl $275, %eax ## imm = 0x113
+; KNL-NEXT: bextrl %eax, %edi, %r11d
+; KNL-NEXT: movzbl %dl, %edx
+; KNL-NEXT: vmovd %esi, %xmm3
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $31, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB22_32
+; KNL-NEXT: ## BB#31:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB22_32:
+; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; KNL-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2
+; KNL-NEXT: movl $276, %eax ## imm = 0x114
+; KNL-NEXT: bextrl %eax, %edi, %esi
+; KNL-NEXT: movl $277, %r11d ## imm = 0x115
+; KNL-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3
+; KNL-NEXT: movzbl %r10b, %r10d
+; KNL-NEXT: movb %r15b, %al
+; KNL-NEXT: shrb %al
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB22_34
+; KNL-NEXT: ## BB#33:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB22_34:
+; KNL-NEXT: vpinsrb $4, %esi, %xmm2, %xmm2
+; KNL-NEXT: bextrl %r11d, %edi, %edx
+; KNL-NEXT: movl $278, %r11d ## imm = 0x116
+; KNL-NEXT: vpinsrb $2, %r10d, %xmm3, %xmm3
+; KNL-NEXT: movzbl %r9b, %esi
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: movq %r15, %rcx
+; KNL-NEXT: shlq $63, %rcx
+; KNL-NEXT: sarq $63, %rcx
+; KNL-NEXT: vmovd %ecx, %xmm4
+; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
+; KNL-NEXT: movb %r15b, %al
+; KNL-NEXT: shrb $2, %al
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB22_36
+; KNL-NEXT: ## BB#35:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB22_36:
+; KNL-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2
+; KNL-NEXT: bextrl %r11d, %edi, %edx
+; KNL-NEXT: movl $279, %r9d ## imm = 0x117
+; KNL-NEXT: vpinsrb $3, %esi, %xmm3, %xmm3
+; KNL-NEXT: movzbl %bl, %ebx
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
+; KNL-NEXT: movb %r15b, %al
+; KNL-NEXT: shrb $3, %al
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB22_38
+; KNL-NEXT: ## BB#37:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB22_38:
+; KNL-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2
+; KNL-NEXT: bextrl %r9d, %edi, %edx
+; KNL-NEXT: movl $280, %esi ## imm = 0x118
+; KNL-NEXT: vpinsrb $4, %ebx, %xmm3, %xmm3
+; KNL-NEXT: movzbl %r12b, %ebx
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
+; KNL-NEXT: movb %r15b, %al
+; KNL-NEXT: shrb $4, %al
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB22_40
+; KNL-NEXT: ## BB#39:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB22_40:
+; KNL-NEXT: vpinsrb $7, %edx, %xmm2, %xmm2
+; KNL-NEXT: bextrl %esi, %edi, %ecx
+; KNL-NEXT: movl $281, %edx ## imm = 0x119
+; KNL-NEXT: vpinsrb $5, %ebx, %xmm3, %xmm3
+; KNL-NEXT: movzbl %r14b, %esi
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
+; KNL-NEXT: movb %r15b, %al
+; KNL-NEXT: shrb $5, %al
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB22_42
+; KNL-NEXT: ## BB#41:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB22_42:
+; KNL-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
+; KNL-NEXT: bextrl %edx, %edi, %ecx
+; KNL-NEXT: movl $282, %edx ## imm = 0x11A
+; KNL-NEXT: vpinsrb $6, %esi, %xmm3, %xmm3
+; KNL-NEXT: movzbl %r8b, %esi
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
+; KNL-NEXT: movb %r15b, %bl
+; KNL-NEXT: shrb $6, %bl
+; KNL-NEXT: andb $1, %bl
+; KNL-NEXT: je LBB22_44
+; KNL-NEXT: ## BB#43:
+; KNL-NEXT: movb $-1, %bl
+; KNL-NEXT: LBB22_44:
+; KNL-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
+; KNL-NEXT: bextrl %edx, %edi, %eax
+; KNL-NEXT: movl $283, %ecx ## imm = 0x11B
+; KNL-NEXT: vpinsrb $7, %esi, %xmm3, %xmm3
+; KNL-NEXT: movzbl %r13b, %esi
+; KNL-NEXT: movzbl %bl, %edx
+; KNL-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4
+; KNL-NEXT: movb %r15b, %bl
+; KNL-NEXT: shrb $7, %bl
+; KNL-NEXT: je LBB22_46
+; KNL-NEXT: ## BB#45:
+; KNL-NEXT: movb $-1, %bl
+; KNL-NEXT: LBB22_46:
+; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; KNL-NEXT: bextrl %ecx, %edi, %ecx
+; KNL-NEXT: movl $284, %edx ## imm = 0x11C
+; KNL-NEXT: vpinsrb $8, %esi, %xmm3, %xmm3
+; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rax ## 8-byte Reload
+; KNL-NEXT: movzbl %al, %esi
+; KNL-NEXT: movzbl %bl, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $8, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB22_48
+; KNL-NEXT: ## BB#47:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB22_48:
+; KNL-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
+; KNL-NEXT: bextrl %edx, %edi, %ecx
+; KNL-NEXT: movl $285, %edx ## imm = 0x11D
+; KNL-NEXT: vpinsrb $9, %esi, %xmm3, %xmm3
+; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload
+; KNL-NEXT: movzbl %sil, %esi
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $9, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB22_50
+; KNL-NEXT: ## BB#49:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB22_50:
+; KNL-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
+; KNL-NEXT: bextrl %edx, %edi, %ecx
+; KNL-NEXT: movl $286, %edx ## imm = 0x11E
+; KNL-NEXT: vpinsrb $10, %esi, %xmm3, %xmm3
+; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload
+; KNL-NEXT: movzbl %sil, %esi
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $10, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB22_52
+; KNL-NEXT: ## BB#51:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB22_52:
+; KNL-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
+; KNL-NEXT: bextrl %edx, %edi, %edx
+; KNL-NEXT: vpinsrb $11, %esi, %xmm3, %xmm3
+; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
+; KNL-NEXT: movzbl %cl, %ecx
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $11, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB22_54
+; KNL-NEXT: ## BB#53:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB22_54:
+; KNL-NEXT: vpinsrb $14, %edx, %xmm2, %xmm2
+; KNL-NEXT: shrl $31, %edi
+; KNL-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3
+; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
+; KNL-NEXT: movzbl %cl, %ecx
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $12, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB22_56
+; KNL-NEXT: ## BB#55:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB22_56:
+; KNL-NEXT: vpinsrb $15, %edi, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3
+; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
+; KNL-NEXT: movzbl %cl, %ecx
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $13, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB22_58
+; KNL-NEXT: ## BB#57:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB22_58:
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; KNL-NEXT: vpinsrb $14, %ecx, %xmm3, %xmm2
+; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
+; KNL-NEXT: movzbl %cl, %ecx
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm3
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $14, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB22_60
+; KNL-NEXT: ## BB#59:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB22_60:
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; KNL-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm2
+; KNL-NEXT: shrq $15, %r15
+; KNL-NEXT: andb $1, %r15b
+; KNL-NEXT: je LBB22_62
+; KNL-NEXT: ## BB#61:
+; KNL-NEXT: movb $-1, %r15b
+; KNL-NEXT: LBB22_62:
+; KNL-NEXT: movzbl %r15b, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
+; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
+; KNL-NEXT: leaq -40(%rbp), %rsp
+; KNL-NEXT: popq %rbx
+; KNL-NEXT: popq %r12
+; KNL-NEXT: popq %r13
+; KNL-NEXT: popq %r14
+; KNL-NEXT: popq %r15
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test16:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovq %rdi, %k0
+; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: kshiftrw $15, %k1, %k1
+; SKX-NEXT: kshiftlq $5, %k1, %k1
+; SKX-NEXT: korq %k1, %k0, %k0
+; SKX-NEXT: vpmovm2b %k0, %zmm0
+; SKX-NEXT: retq
%a = bitcast i64 %x to <64 x i1>
%b = insertelement <64 x i1>%a, i1 true, i32 5
%c = sext <64 x i1>%b to <64 x i8>
ret <64 x i8>%c
}
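; The size difference above is the point of the test: SKX keeps the 64-bit
; mask in a k-register (kmovq + kshift/kor), while KNL, lacking 64-bit mask
; operations, scalarizes the single-bit insert.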
-; SKX-LABEL: test17
-; SKX: setg %al
-; SKX: andl $1, %eax
-; SKX: kmovw %eax, %k1
-; SKX: kshiftlq $5, %k1, %k1
-; SKX: korq %k1, %k0, %k0
-; SKX: vpmovm2b %k0, %zmm0
define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
+; KNL-LABEL: test17:
+; KNL: ## BB#0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: Ltmp8:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: Ltmp9:
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: Ltmp10:
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: pushq %r15
+; KNL-NEXT: pushq %r14
+; KNL-NEXT: pushq %r13
+; KNL-NEXT: pushq %r12
+; KNL-NEXT: pushq %rbx
+; KNL-NEXT: andq $-32, %rsp
+; KNL-NEXT: subq $128, %rsp
+; KNL-NEXT: Ltmp11:
+; KNL-NEXT: .cfi_offset %rbx, -56
+; KNL-NEXT: Ltmp12:
+; KNL-NEXT: .cfi_offset %r12, -48
+; KNL-NEXT: Ltmp13:
+; KNL-NEXT: .cfi_offset %r13, -40
+; KNL-NEXT: Ltmp14:
+; KNL-NEXT: .cfi_offset %r14, -32
+; KNL-NEXT: Ltmp15:
+; KNL-NEXT: .cfi_offset %r15, -24
+; KNL-NEXT: movq %rdi, %rax
+; KNL-NEXT: shrq $32, %rax
+; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp)
+; KNL-NEXT: movl %edi, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: vmovd %eax, %xmm0
+; KNL-NEXT: movl $257, %eax ## imm = 0x101
+; KNL-NEXT: bextrl %eax, %edi, %eax
+; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; KNL-NEXT: movl $258, %eax ## imm = 0x102
+; KNL-NEXT: bextrl %eax, %edi, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; KNL-NEXT: movl $259, %eax ## imm = 0x103
+; KNL-NEXT: bextrl %eax, %edi, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; KNL-NEXT: movl $260, %eax ## imm = 0x104
+; KNL-NEXT: bextrl %eax, %edi, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; KNL-NEXT: movl $261, %eax ## imm = 0x105
+; KNL-NEXT: bextrl %eax, %edi, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT: movl $262, %eax ## imm = 0x106
+; KNL-NEXT: bextrl %eax, %edi, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; KNL-NEXT: movl $263, %eax ## imm = 0x107
+; KNL-NEXT: bextrl %eax, %edi, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; KNL-NEXT: movl $264, %eax ## imm = 0x108
+; KNL-NEXT: bextrl %eax, %edi, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; KNL-NEXT: movl $265, %eax ## imm = 0x109
+; KNL-NEXT: bextrl %eax, %edi, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; KNL-NEXT: movl $266, %eax ## imm = 0x10A
+; KNL-NEXT: bextrl %eax, %edi, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; KNL-NEXT: movl $267, %eax ## imm = 0x10B
+; KNL-NEXT: bextrl %eax, %edi, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; KNL-NEXT: movl $268, %eax ## imm = 0x10C
+; KNL-NEXT: bextrl %eax, %edi, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT: movl $269, %eax ## imm = 0x10D
+; KNL-NEXT: bextrl %eax, %edi, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; KNL-NEXT: movl $270, %eax ## imm = 0x10E
+; KNL-NEXT: bextrl %eax, %edi, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; KNL-NEXT: movl $271, %eax ## imm = 0x10F
+; KNL-NEXT: bextrl %eax, %edi, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
+; KNL-NEXT: cmpl %edx, %esi
+; KNL-NEXT: setg %al
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm0
+; KNL-NEXT: movl {{[0-9]+}}(%rsp), %r15d
+; KNL-NEXT: movq %r15, %rdx
+; KNL-NEXT: shrq $17, %rdx
+; KNL-NEXT: andb $1, %dl
+; KNL-NEXT: je LBB23_2
+; KNL-NEXT: ## BB#1:
+; KNL-NEXT: movb $-1, %dl
+; KNL-NEXT: LBB23_2:
+; KNL-NEXT: movq %r15, %r11
+; KNL-NEXT: shrq $16, %r11
+; KNL-NEXT: andb $1, %r11b
+; KNL-NEXT: je LBB23_4
+; KNL-NEXT: ## BB#3:
+; KNL-NEXT: movb $-1, %r11b
+; KNL-NEXT: LBB23_4:
+; KNL-NEXT: movq %r15, %r10
+; KNL-NEXT: shrq $18, %r10
+; KNL-NEXT: andb $1, %r10b
+; KNL-NEXT: je LBB23_6
+; KNL-NEXT: ## BB#5:
+; KNL-NEXT: movb $-1, %r10b
+; KNL-NEXT: LBB23_6:
+; KNL-NEXT: movq %r15, %r9
+; KNL-NEXT: shrq $19, %r9
+; KNL-NEXT: andb $1, %r9b
+; KNL-NEXT: je LBB23_8
+; KNL-NEXT: ## BB#7:
+; KNL-NEXT: movb $-1, %r9b
+; KNL-NEXT: LBB23_8:
+; KNL-NEXT: movq %r15, %rbx
+; KNL-NEXT: shrq $20, %rbx
+; KNL-NEXT: andb $1, %bl
+; KNL-NEXT: je LBB23_10
+; KNL-NEXT: ## BB#9:
+; KNL-NEXT: movb $-1, %bl
+; KNL-NEXT: LBB23_10:
+; KNL-NEXT: movq %r15, %r12
+; KNL-NEXT: shrq $21, %r12
+; KNL-NEXT: andb $1, %r12b
+; KNL-NEXT: je LBB23_12
+; KNL-NEXT: ## BB#11:
+; KNL-NEXT: movb $-1, %r12b
+; KNL-NEXT: LBB23_12:
+; KNL-NEXT: movq %r15, %r14
+; KNL-NEXT: shrq $22, %r14
+; KNL-NEXT: andb $1, %r14b
+; KNL-NEXT: je LBB23_14
+; KNL-NEXT: ## BB#13:
+; KNL-NEXT: movb $-1, %r14b
+; KNL-NEXT: LBB23_14:
+; KNL-NEXT: movq %r15, %r8
+; KNL-NEXT: shrq $23, %r8
+; KNL-NEXT: andb $1, %r8b
+; KNL-NEXT: je LBB23_16
+; KNL-NEXT: ## BB#15:
+; KNL-NEXT: movb $-1, %r8b
+; KNL-NEXT: LBB23_16:
+; KNL-NEXT: movq %r15, %r13
+; KNL-NEXT: shrq $24, %r13
+; KNL-NEXT: andb $1, %r13b
+; KNL-NEXT: je LBB23_18
+; KNL-NEXT: ## BB#17:
+; KNL-NEXT: movb $-1, %r13b
+; KNL-NEXT: LBB23_18:
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $25, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB23_20
+; KNL-NEXT: ## BB#19:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB23_20:
+; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $26, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB23_22
+; KNL-NEXT: ## BB#21:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB23_22:
+; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; KNL-NEXT: movl $272, %esi ## imm = 0x110
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $27, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB23_24
+; KNL-NEXT: ## BB#23:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB23_24:
+; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; KNL-NEXT: movl $273, %eax ## imm = 0x111
+; KNL-NEXT: bextrl %esi, %edi, %esi
+; KNL-NEXT: movq %r15, %rcx
+; KNL-NEXT: shrq $28, %rcx
+; KNL-NEXT: andb $1, %cl
+; KNL-NEXT: je LBB23_26
+; KNL-NEXT: ## BB#25:
+; KNL-NEXT: movb $-1, %cl
+; KNL-NEXT: LBB23_26:
+; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; KNL-NEXT: bextrl %eax, %edi, %eax
+; KNL-NEXT: vmovd %esi, %xmm2
+; KNL-NEXT: movl $274, %esi ## imm = 0x112
+; KNL-NEXT: movq %r15, %rcx
+; KNL-NEXT: shrq $29, %rcx
+; KNL-NEXT: andb $1, %cl
+; KNL-NEXT: je LBB23_28
+; KNL-NEXT: ## BB#27:
+; KNL-NEXT: movb $-1, %cl
+; KNL-NEXT: LBB23_28:
+; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; KNL-NEXT: bextrl %esi, %edi, %eax
+; KNL-NEXT: movzbl %r11b, %esi
+; KNL-NEXT: movq %r15, %rcx
+; KNL-NEXT: shrq $30, %rcx
+; KNL-NEXT: andb $1, %cl
+; KNL-NEXT: je LBB23_30
+; KNL-NEXT: ## BB#29:
+; KNL-NEXT: movb $-1, %cl
+; KNL-NEXT: LBB23_30:
+; KNL-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; KNL-NEXT: movl $275, %eax ## imm = 0x113
+; KNL-NEXT: bextrl %eax, %edi, %r11d
+; KNL-NEXT: movzbl %dl, %edx
+; KNL-NEXT: vmovd %esi, %xmm3
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $31, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB23_32
+; KNL-NEXT: ## BB#31:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB23_32:
+; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; KNL-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2
+; KNL-NEXT: movl $276, %eax ## imm = 0x114
+; KNL-NEXT: bextrl %eax, %edi, %esi
+; KNL-NEXT: movl $277, %r11d ## imm = 0x115
+; KNL-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3
+; KNL-NEXT: movzbl %r10b, %r10d
+; KNL-NEXT: movb %r15b, %al
+; KNL-NEXT: shrb %al
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB23_34
+; KNL-NEXT: ## BB#33:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB23_34:
+; KNL-NEXT: vpinsrb $4, %esi, %xmm2, %xmm2
+; KNL-NEXT: bextrl %r11d, %edi, %edx
+; KNL-NEXT: movl $278, %r11d ## imm = 0x116
+; KNL-NEXT: vpinsrb $2, %r10d, %xmm3, %xmm3
+; KNL-NEXT: movzbl %r9b, %esi
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: movq %r15, %rcx
+; KNL-NEXT: shlq $63, %rcx
+; KNL-NEXT: sarq $63, %rcx
+; KNL-NEXT: vmovd %ecx, %xmm4
+; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
+; KNL-NEXT: movb %r15b, %al
+; KNL-NEXT: shrb $2, %al
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB23_36
+; KNL-NEXT: ## BB#35:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB23_36:
+; KNL-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2
+; KNL-NEXT: bextrl %r11d, %edi, %edx
+; KNL-NEXT: movl $279, %r9d ## imm = 0x117
+; KNL-NEXT: vpinsrb $3, %esi, %xmm3, %xmm3
+; KNL-NEXT: movzbl %bl, %ebx
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
+; KNL-NEXT: movb %r15b, %al
+; KNL-NEXT: shrb $3, %al
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB23_38
+; KNL-NEXT: ## BB#37:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB23_38:
+; KNL-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2
+; KNL-NEXT: bextrl %r9d, %edi, %edx
+; KNL-NEXT: movl $280, %esi ## imm = 0x118
+; KNL-NEXT: vpinsrb $4, %ebx, %xmm3, %xmm3
+; KNL-NEXT: movzbl %r12b, %ebx
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
+; KNL-NEXT: movb %r15b, %al
+; KNL-NEXT: shrb $4, %al
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB23_40
+; KNL-NEXT: ## BB#39:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB23_40:
+; KNL-NEXT: vpinsrb $7, %edx, %xmm2, %xmm2
+; KNL-NEXT: bextrl %esi, %edi, %ecx
+; KNL-NEXT: movl $281, %edx ## imm = 0x119
+; KNL-NEXT: vpinsrb $5, %ebx, %xmm3, %xmm3
+; KNL-NEXT: movzbl %r14b, %esi
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
+; KNL-NEXT: movb %r15b, %al
+; KNL-NEXT: shrb $5, %al
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB23_42
+; KNL-NEXT: ## BB#41:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB23_42:
+; KNL-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
+; KNL-NEXT: bextrl %edx, %edi, %ecx
+; KNL-NEXT: movl $282, %edx ## imm = 0x11A
+; KNL-NEXT: vpinsrb $6, %esi, %xmm3, %xmm3
+; KNL-NEXT: movzbl %r8b, %esi
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
+; KNL-NEXT: movb %r15b, %bl
+; KNL-NEXT: shrb $6, %bl
+; KNL-NEXT: andb $1, %bl
+; KNL-NEXT: je LBB23_44
+; KNL-NEXT: ## BB#43:
+; KNL-NEXT: movb $-1, %bl
+; KNL-NEXT: LBB23_44:
+; KNL-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
+; KNL-NEXT: bextrl %edx, %edi, %eax
+; KNL-NEXT: movl $283, %ecx ## imm = 0x11B
+; KNL-NEXT: vpinsrb $7, %esi, %xmm3, %xmm3
+; KNL-NEXT: movzbl %r13b, %esi
+; KNL-NEXT: movzbl %bl, %edx
+; KNL-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4
+; KNL-NEXT: movb %r15b, %bl
+; KNL-NEXT: shrb $7, %bl
+; KNL-NEXT: je LBB23_46
+; KNL-NEXT: ## BB#45:
+; KNL-NEXT: movb $-1, %bl
+; KNL-NEXT: LBB23_46:
+; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; KNL-NEXT: bextrl %ecx, %edi, %ecx
+; KNL-NEXT: movl $284, %edx ## imm = 0x11C
+; KNL-NEXT: vpinsrb $8, %esi, %xmm3, %xmm3
+; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rax ## 8-byte Reload
+; KNL-NEXT: movzbl %al, %esi
+; KNL-NEXT: movzbl %bl, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $8, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB23_48
+; KNL-NEXT: ## BB#47:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB23_48:
+; KNL-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
+; KNL-NEXT: bextrl %edx, %edi, %ecx
+; KNL-NEXT: movl $285, %edx ## imm = 0x11D
+; KNL-NEXT: vpinsrb $9, %esi, %xmm3, %xmm3
+; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload
+; KNL-NEXT: movzbl %sil, %esi
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $9, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB23_50
+; KNL-NEXT: ## BB#49:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB23_50:
+; KNL-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
+; KNL-NEXT: bextrl %edx, %edi, %ecx
+; KNL-NEXT: movl $286, %edx ## imm = 0x11E
+; KNL-NEXT: vpinsrb $10, %esi, %xmm3, %xmm3
+; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload
+; KNL-NEXT: movzbl %sil, %esi
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $10, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB23_52
+; KNL-NEXT: ## BB#51:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB23_52:
+; KNL-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
+; KNL-NEXT: bextrl %edx, %edi, %edx
+; KNL-NEXT: vpinsrb $11, %esi, %xmm3, %xmm3
+; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
+; KNL-NEXT: movzbl %cl, %ecx
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $11, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB23_54
+; KNL-NEXT: ## BB#53:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB23_54:
+; KNL-NEXT: vpinsrb $14, %edx, %xmm2, %xmm2
+; KNL-NEXT: shrl $31, %edi
+; KNL-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3
+; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
+; KNL-NEXT: movzbl %cl, %ecx
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $12, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB23_56
+; KNL-NEXT: ## BB#55:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB23_56:
+; KNL-NEXT: vpinsrb $15, %edi, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3
+; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
+; KNL-NEXT: movzbl %cl, %ecx
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $13, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB23_58
+; KNL-NEXT: ## BB#57:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB23_58:
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; KNL-NEXT: vpinsrb $14, %ecx, %xmm3, %xmm2
+; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
+; KNL-NEXT: movzbl %cl, %ecx
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm3
+; KNL-NEXT: movq %r15, %rax
+; KNL-NEXT: shrq $14, %rax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: je LBB23_60
+; KNL-NEXT: ## BB#59:
+; KNL-NEXT: movb $-1, %al
+; KNL-NEXT: LBB23_60:
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; KNL-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm2
+; KNL-NEXT: shrq $15, %r15
+; KNL-NEXT: andb $1, %r15b
+; KNL-NEXT: je LBB23_62
+; KNL-NEXT: ## BB#61:
+; KNL-NEXT: movb $-1, %r15b
+; KNL-NEXT: LBB23_62:
+; KNL-NEXT: movzbl %r15b, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
+; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
+; KNL-NEXT: leaq -40(%rbp), %rsp
+; KNL-NEXT: popq %rbx
+; KNL-NEXT: popq %r12
+; KNL-NEXT: popq %r13
+; KNL-NEXT: popq %r14
+; KNL-NEXT: popq %r15
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test17:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovq %rdi, %k0
+; SKX-NEXT: cmpl %edx, %esi
+; SKX-NEXT: setg %al
+; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: kshiftlq $5, %k1, %k1
+; SKX-NEXT: korq %k1, %k0, %k0
+; SKX-NEXT: vpmovm2b %k0, %zmm0
+; SKX-NEXT: retq
%a = bitcast i64 %x to <64 x i1>
%b = icmp sgt i32 %y, %z
%c = insertelement <64 x i1>%a, i1 %b, i32 5
%d = sext <64 x i1>%c to <64 x i8>
ret <64 x i8>%d
}
-; KNL-LABEL: test18
define <8 x i1> @test18(i8 %a, i16 %y) {
+; KNL-LABEL: test18:
+; KNL: ## BB#0:
+; KNL-NEXT: movzbl %dil, %eax
+; KNL-NEXT: kmovw %eax, %k0
+; KNL-NEXT: kmovw %esi, %k1
+; KNL-NEXT: kshiftlw $7, %k1, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kshiftlw $6, %k1, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kshiftlw $6, %k1, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $7, %k2, %k1
+; KNL-NEXT: korw %k1, %k0, %k1
+; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovqw %zmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test18:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovb %edi, %k0
+; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: kshiftlw $6, %k1, %k2
+; SKX-NEXT: kshiftrw $15, %k2, %k2
+; SKX-NEXT: kshiftlw $7, %k1, %k1
+; SKX-NEXT: kshiftrw $15, %k1, %k1
+; SKX-NEXT: kshiftlb $7, %k1, %k1
+; SKX-NEXT: kshiftlb $6, %k2, %k2
+; SKX-NEXT: korb %k2, %k0, %k0
+; SKX-NEXT: korb %k1, %k0, %k0
+; SKX-NEXT: vpmovm2w %k0, %xmm0
+; SKX-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%b1 = bitcast i16 %y to <16 x i1>
%el1 = extractelement <16 x i1>%b1, i32 8
%el2 = extractelement <16 x i1>%b1, i32 9
%c = insertelement <8 x i1>%b, i1 %el1, i32 7
%d = insertelement <8 x i1>%c, i1 %el2, i32 6
ret <8 x i1>%d
}
-
-; KNL-LABEL: test21
-; KNL: vpand %ymm
-; KNL: vextracti128 $1, %ymm2
-; KNL: vpand %ymm
-
-; SKX-LABEL: test21
-; SKX: vpmovb2m
-; SKX: vmovdqu16 {{.*}}%k1
-
define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone {
+; KNL-LABEL: test21:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; KNL-NEXT: vpsllw $15, %ymm3, %ymm3
+; KNL-NEXT: vpsraw $15, %ymm3, %ymm3
+; KNL-NEXT: vpand %ymm0, %ymm3, %ymm0
+; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; KNL-NEXT: vpsllw $15, %ymm2, %ymm2
+; KNL-NEXT: vpsraw $15, %ymm2, %ymm2
+; KNL-NEXT: vpand %ymm1, %ymm2, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test21:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %ymm1, %ymm1
+; SKX-NEXT: vpmovb2m %ymm1, %k1
+; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; SKX-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
%ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
ret <32 x i16> %ret
}
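; Masked vmovdqu16 requires AVX512BW; KNL emulates the <32 x i16> select by
; sign-extending the i1 mask to words and applying vpand to each ymm half.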
-; SKX-LABEL: test22
-; SKX: kmovb
define void @test22(<4 x i1> %a, <4 x i1>* %addr) {
+; KNL-LABEL: test22:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrd $3, %xmm0, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: vpextrd $2, %xmm0, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: vpextrd $1, %xmm0, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: vmovd %xmm0, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test22:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0
+; SKX-NEXT: vpmovd2m %xmm0, %k0
+; SKX-NEXT: kmovb %k0, (%rdi)
+; SKX-NEXT: retq
store <4 x i1> %a, <4 x i1>* %addr
ret void
}
-; SKX-LABEL: test23
-; SKX: kmovb
define void @test23(<2 x i1> %a, <2 x i1>* %addr) {
+; KNL-LABEL: test23:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test23:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
+; SKX-NEXT: vpmovq2m %xmm0, %k0
+; SKX-NEXT: kmovb %k0, (%rdi)
+; SKX-NEXT: retq
store <2 x i1> %a, <2 x i1>* %addr
ret void
}
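; Storing <2 x i1> and <4 x i1> directly uses kmovb (AVX512DQ) on SKX;
; KNL extracts and stores each element separately.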
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512dq -mattr=+avx512vl| FileCheck %s
define <8 x i1> @test(<2 x i1> %a) {
; CHECK-LABEL: test:
; CHECK: # BB#0:
+; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
; CHECK-NEXT: vpmovq2m %xmm0, %k0
; CHECK-NEXT: kshiftlb $2, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
define <8 x i1> @test1(<2 x i1> %a) {
; CHECK-LABEL: test1:
; CHECK: # BB#0:
+; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
; CHECK-NEXT: vpmovq2m %xmm0, %k0
; CHECK-NEXT: kshiftlb $4, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
define <8 x i1> @test2(<2 x i1> %a) {
; CHECK-LABEL: test2:
; CHECK: # BB#0:
+; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
; CHECK-NEXT: vpmovq2m %xmm0, %k0
; CHECK-NEXT: vpmovm2q %k0, %zmm0
; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; CHECK-NEXT: vpsllq $63, %zmm0, %zmm0
; CHECK-NEXT: vpmovq2m %zmm0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
define <8 x i1> @test3(<4 x i1> %a) {
; CHECK-LABEL: test3:
; CHECK: # BB#0:
+; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
; CHECK-NEXT: vpmovd2m %xmm0, %k0
; CHECK-NEXT: kshiftlb $4, %k0, %k0
; CHECK-NEXT: kshiftrb $4, %k0, %k0
define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) {
; CHECK-LABEL: test4:
; CHECK: # BB#0:
+; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
; CHECK-NEXT: vpmovd2m %xmm0, %k0
; CHECK-NEXT: kshiftlb $4, %k0, %k0
; CHECK-NEXT: kshiftrb $4, %k0, %k1
define <4 x i1> @test5(<2 x i1> %a, <2 x i1>%b) {
; CHECK-LABEL: test5:
; CHECK: # BB#0:
+; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
; CHECK-NEXT: vpmovq2m %xmm0, %k0
; CHECK-NEXT: kshiftlw $2, %k0, %k0
; CHECK-NEXT: kshiftrw $2, %k0, %k1
define <16 x i1> @test6(<2 x i1> %a, <2 x i1>%b) {
; CHECK-LABEL: test6:
; CHECK: # BB#0:
+; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
; CHECK-NEXT: vpmovq2m %xmm0, %k0
; CHECK-NEXT: kshiftlw $2, %k0, %k0
; CHECK-NEXT: kshiftrw $2, %k0, %k1
define <32 x i1> @test7(<4 x i1> %a, <4 x i1>%b) {
; CHECK-LABEL: test7:
; CHECK: # BB#0:
+; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
; CHECK-NEXT: vpmovd2m %xmm0, %k0
; CHECK-NEXT: kshiftlb $4, %k0, %k0
; CHECK-NEXT: kshiftrb $4, %k0, %k1
define <64 x i1> @test8(<8 x i1> %a, <8 x i1>%b) {
; CHECK-LABEL: test8:
; CHECK: # BB#0:
+; CHECK-NEXT: vpsllw $15, %xmm1, %xmm1
; CHECK-NEXT: vpmovw2m %xmm1, %k0
+; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
; CHECK-NEXT: vpmovw2m %xmm0, %k1
; CHECK-NEXT: kunpckdq %k1, %k0, %k0
; CHECK-NEXT: vpmovm2b %k0, %zmm0
ret i64 %res
}
+declare i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8>)
+
+define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) {
+; AVX512BW-LABEL: test_int_x86_avx512_cvtb2mask_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_cvtb2mask_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: subl $12, %esp
+; AVX512F-32-NEXT: .Ltmp9:
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: kmovq %k0, (%esp)
+; AVX512F-32-NEXT: movl (%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: retl
+ %res = call i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8> %x0)
+ ret i64 %res
+}
+
+declare i32 @llvm.x86.avx512.cvtw2mask.512(<32 x i16>)
+
+define i32@test_int_x86_avx512_cvtw2mask_512(<32 x i16> %x0) {
+; AVX512BW-LABEL: test_int_x86_avx512_cvtw2mask_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_cvtw2mask_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpmovw2m %zmm0, %k0
+; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: retl
+ %res = call i32 @llvm.x86.avx512.cvtw2mask.512(<32 x i16> %x0)
+ ret i32 %res
+}
+
declare <64 x i8> @llvm.x86.avx512.cvtmask2b.512(i64)
define <64 x i8>@test_int_x86_avx512_cvtmask2b_512(i64 %x0) {
ret <32 x i16> %res4
}
+declare i16 @llvm.x86.avx512.cvtb2mask.128(<16 x i8>)
+
+define i16@test_int_x86_avx512_cvtb2mask_128(<16 x i8> %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtb2mask_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovb2m %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.cvtb2mask.128(<16 x i8> %x0)
+ ret i16 %res
+}
+
+declare i32 @llvm.x86.avx512.cvtb2mask.256(<32 x i8>)
+
+define i32@test_int_x86_avx512_cvtb2mask_256(<32 x i8> %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtb2mask_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovb2m %ymm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.x86.avx512.cvtb2mask.256(<32 x i8> %x0)
+ ret i32 %res
+}
+
+declare i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16>)
+
+define i8@test_int_x86_avx512_cvtw2mask_128(<8 x i16> %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtw2mask_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovw2m %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16> %x0)
+ ret i8 %res
+}
+
+declare i16 @llvm.x86.avx512.cvtw2mask.256(<16 x i16>)
+
+define i16@test_int_x86_avx512_cvtw2mask_256(<16 x i16> %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtw2mask_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovw2m %ymm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.cvtw2mask.256(<16 x i16> %x0)
+ ret i16 %res
+}
+
declare <16 x i8> @llvm.x86.avx512.cvtmask2b.128(i16)
define <16 x i8>@test_int_x86_avx512_cvtmask2b_128(i16 %x0) {
ret <16 x i32> %res4
}
+declare i16 @llvm.x86.avx512.cvtd2mask.512(<16 x i32>)
+
+define i16@test_int_x86_avx512_cvtd2mask_512(<16 x i32> %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovd2m %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.cvtd2mask.512(<16 x i32> %x0)
+ ret i16 %res
+}
+
+declare i8 @llvm.x86.avx512.cvtq2mask.512(<8 x i64>)
+
+define i8@test_int_x86_avx512_cvtq2mask_512(<8 x i64> %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovq2m %zmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.x86.avx512.cvtq2mask.512(<8 x i64> %x0)
+ ret i8 %res
+}
+
declare <16 x i32> @llvm.x86.avx512.cvtmask2d.512(i16)
define <16 x i32> @test_int_x86_avx512_cvtmask2d_512(i16 %x0) {
%res = call <8 x i64> @llvm.x86.avx512.cvtmask2q.512(i8 %x0)
ret <8 x i64> %res
}
-
ret <4 x i32> %res4
}
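+; The 128-/256-bit dword/qword forms all return an i8 mask.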
+declare i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32>)
+
+define i8 @test_int_x86_avx512_cvtd2mask_128(<4 x i32> %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovd2m %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32> %x0)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.cvtd2mask.256(<8 x i32>)
+
+define i8 @test_int_x86_avx512_cvtd2mask_256(<8 x i32> %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovd2m %ymm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.x86.avx512.cvtd2mask.256(<8 x i32> %x0)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64>)
+
+define i8 @test_int_x86_avx512_cvtq2mask_128(<2 x i64> %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovq2m %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64> %x0)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.cvtq2mask.256(<4 x i64>)
+
+define i8 @test_int_x86_avx512_cvtq2mask_256(<4 x i64> %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovq2m %ymm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.x86.avx512.cvtq2mask.256(<4 x i64> %x0)
+ ret i8 %res
+}
+
declare <4 x i32> @llvm.x86.avx512.cvtmask2d.128(i8)
define <4 x i32> @test_int_x86_avx512_cvtmask2d_128(i8 %x0) {
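; The masked gather/scatter tests below pick up the new i1-vector lowering:
; instead of AND-ing the mask with a constant-pool splat of 1, the LSB is
; shifted into the sign bit (vpsllq $63 / vpslld $31) before vptestm, and SKX
; now shifts before vpmov*2m, which tests only the MSB of each element.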
; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2
; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm0
-; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; KNL_64-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
; KNL_64-NEXT: retq
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm0
-; KNL_32-NEXT: vpandq .LCPI14_0, %zmm0, %zmm0
+; KNL_32-NEXT: vpsllvq .LCPI14_0, %zmm0, %zmm0
; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test15:
; SKX: # BB#0:
+; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vpmovd2m %xmm1, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test15:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
+; SKX_32-NEXT: vpmovd2m %xmm1, %k1
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
+; SKX_32-NEXT: vmovaps %zmm1, %zmm0
+; SKX_32-NEXT: retl
%sext_ind = sext <4 x i32> %ind to <4 x i64>
%gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
-; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
-; KNL_32-NEXT: vpandq .LCPI15_0, %zmm1, %zmm1
+; KNL_32-NEXT: vpsllvq .LCPI15_0, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
;
; SKX-LABEL: test16:
; SKX: # BB#0:
+; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vpmovd2m %xmm1, %k1
; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
; SKX-NEXT: vmovaps %zmm2, %zmm0
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test16:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
+; SKX_32-NEXT: vpmovd2m %xmm1, %k1
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1}
+; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: retl
%sext_ind = sext <4 x i32> %ind to <4 x i64>
%gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
; KNL_64: # BB#0:
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
-; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpandq .LCPI16_0, %zmm1, %zmm1
+; KNL_32-NEXT: vpsllvq .LCPI16_0, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
;
; SKX-LABEL: test17:
; SKX: # BB#0:
+; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vpmovq2m %xmm1, %k1
; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
; SKX-NEXT: vmovaps %zmm2, %zmm0
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test17:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
+; SKX_32-NEXT: vpmovq2m %xmm1, %k1
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1}
+; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2
-; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: retq
; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2
-; KNL_32-NEXT: vpandq .LCPI17_0, %zmm2, %zmm2
+; KNL_32-NEXT: vpsllvq .LCPI17_0, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test18:
; SKX: # BB#0:
+; SKX-NEXT: vpslld $31, %xmm2, %xmm2
; SKX-NEXT: vpmovd2m %xmm2, %k1
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test18:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
+; SKX_32-NEXT: vpmovd2m %xmm2, %k1
+; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
+; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
ret void
}
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
-; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
; KNL_64-NEXT: retq
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpandq .LCPI18_0, %zmm1, %zmm1
+; KNL_32-NEXT: vpsllvq .LCPI18_0, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test19:
; SKX: # BB#0:
+; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vpmovd2m %xmm1, %k1
; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test19:
; SKX_32: # BB#0:
+; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
; SKX_32-NEXT: vpmovd2m %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
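; For <2 x i1> masks the two live bits are isolated with a kshiftlw/kshiftrw
; pair after vpmovq2m to clear the unused upper mask bits.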
; KNL_64-LABEL: test20:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; KNL_64-NEXT: vmovq %xmm2, %xmm2
+; KNL_64-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero
; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2
-; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: retq
; KNL_32-LABEL: test20:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; KNL_32-NEXT: vmovq %xmm2, %xmm2
+; KNL_32-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero
; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2
-; KNL_32-NEXT: vpandq .LCPI19_0, %zmm2, %zmm2
+; KNL_32-NEXT: vpsllvq .LCPI19_0, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test20:
; SKX: # BB#0:
+; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
; SKX-NEXT: vpmovq2m %xmm2, %k0
; SKX-NEXT: kshiftlw $2, %k0, %k0
; SKX-NEXT: kshiftrw $2, %k0, %k1
; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test20:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
+; SKX_32-NEXT: vpmovq2m %xmm2, %k0
+; SKX_32-NEXT: kshiftlw $2, %k0, %k0
+; SKX_32-NEXT: kshiftrw $2, %k0, %k1
+; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1}
+; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
ret void
}
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: retq
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_32-NEXT: vpandq .LCPI20_0, %zmm2, %zmm2
+; KNL_32-NEXT: vpsllvq .LCPI20_0, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test21:
; SKX: # BB#0:
+; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
; SKX-NEXT: vpmovq2m %xmm2, %k0
; SKX-NEXT: kshiftlw $2, %k0, %k0
; SKX-NEXT: kshiftrw $2, %k0, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test21:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
+; SKX_32-NEXT: vpmovq2m %xmm2, %k0
+; SKX_32-NEXT: kshiftlw $2, %k0, %k0
+; SKX_32-NEXT: kshiftrw $2, %k0, %k1
+; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
ret void
}
; KNL_64-LABEL: test22:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; KNL_64-NEXT: vmovq %xmm1, %xmm1
+; KNL_64-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero
; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
-; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
; KNL_64-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-LABEL: test22:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; KNL_32-NEXT: vmovq %xmm1, %xmm1
+; KNL_32-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero
; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
-; KNL_32-NEXT: vpandq .LCPI21_0, %zmm1, %zmm1
+; KNL_32-NEXT: vpsllvq .LCPI21_0, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
; SKX-LABEL: test22:
; SKX: # BB#0:
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vpmovq2m %xmm1, %k0
; SKX-NEXT: kshiftlw $2, %k0, %k0
; SKX-NEXT: kshiftrw $2, %k0, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
; SKX-NEXT: vmovaps %zmm2, %zmm0
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test22:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
+; SKX_32-NEXT: vpmovq2m %xmm1, %k0
+; SKX_32-NEXT: kshiftlw $2, %k0, %k0
+; SKX_32-NEXT: kshiftrw $2, %k0, %k1
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1}
+; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
%res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
; KNL_64: # BB#0:
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
-; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpandq .LCPI22_0, %zmm1, %zmm1
+; KNL_32-NEXT: vpsllvq .LCPI22_0, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
;
; SKX-LABEL: test23:
; SKX: # BB#0:
+; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vpmovq2m %xmm1, %k1
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
; SKX-NEXT: vmovaps %zmm2, %zmm0
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test23:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
+; SKX_32-NEXT: vpmovq2m %xmm1, %k1
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
+; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
%res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
}
define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
-;
-;
; KNL_64-LABEL: test24:
; KNL_64: # BB#0:
; KNL_64-NEXT: movb $3, %al
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpxord %zmm1, %zmm1, %zmm1
; KNL_32-NEXT: vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1
-; KNL_32-NEXT: vpandq .LCPI23_1, %zmm1, %zmm1
+; KNL_32-NEXT: vpsllvq .LCPI23_1, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test24:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
+; SKX_32-NEXT: vmovaps %zmm1, %zmm0
+; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
%res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
; KNL_64: # BB#0:
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
-; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpandq .LCPI24_0, %zmm1, %zmm1
+; KNL_32-NEXT: vpsllvq .LCPI24_0, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
;
; SKX-LABEL: test25:
; SKX: # BB#0:
+; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vpmovq2m %xmm1, %k1
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
; SKX-NEXT: vmovaps %zmm2, %zmm0
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test25:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
+; SKX_32-NEXT: vpmovq2m %xmm1, %k1
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
+; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
%res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
; KNL_32-NEXT: vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2
-; KNL_32-NEXT: vpandq .LCPI25_1, %zmm2, %zmm2
+; KNL_32-NEXT: vpsllvq .LCPI25_1, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test26:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
+; SKX_32-NEXT: vmovaps %zmm1, %zmm0
+; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
%res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
; KNL_32-NEXT: vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2
-; KNL_32-NEXT: vpandq .LCPI27_1, %zmm2, %zmm2
+; KNL_32-NEXT: vpsllvq .LCPI27_1, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT: retl
; SKX-NEXT: kmovb %eax, %k1
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test28:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX_32-NEXT: movb $3, %al
+; SKX_32-NEXT: kmovb %eax, %k1
+; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
ret void
}
;
; SKX-LABEL: test30:
; SKX: # BB#0:
+; SKX-NEXT: vpslld $31, %xmm2, %xmm2
; SKX-NEXT: vpmovd2m %xmm2, %k1
; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
; SKX-NEXT: vmovdqa32 %xmm0, %xmm3 {%k1}
; SKX-NEXT: vmovaps %zmm3, %zmm0
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test30:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: subl $12, %esp
+; SKX_32-NEXT: .Ltmp0:
+; SKX_32-NEXT: .cfi_def_cfa_offset 16
+; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
+; SKX_32-NEXT: vpmovd2m %xmm2, %k1
+; SKX_32-NEXT: kmovb %k1, {{[0-9]+}}(%esp)
+; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
+; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
+; SKX_32-NEXT: # implicit-def: %XMM1
+; SKX_32-NEXT: andb $1, %al
+; SKX_32-NEXT: je .LBB29_2
+; SKX_32-NEXT: # BB#1: # %cond.load
+; SKX_32-NEXT: vmovd %xmm2, %eax
+; SKX_32-NEXT: vmovd (%eax), %xmm1
+; SKX_32-NEXT: .LBB29_2: # %else
+; SKX_32-NEXT: kmovb %k1, {{[0-9]+}}(%esp)
+; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
+; SKX_32-NEXT: andb $1, %al
+; SKX_32-NEXT: je .LBB29_4
+; SKX_32-NEXT: # BB#3: # %cond.load1
+; SKX_32-NEXT: vpextrd $1, %xmm2, %eax
+; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm1, %xmm1
+; SKX_32-NEXT: .LBB29_4: # %else2
+; SKX_32-NEXT: vmovdqa32 {{[0-9]+}}(%esp), %xmm0
+; SKX_32-NEXT: kmovb %k1, (%esp)
+; SKX_32-NEXT: movb (%esp), %al
+; SKX_32-NEXT: andb $1, %al
+; SKX_32-NEXT: je .LBB29_6
+; SKX_32-NEXT: # BB#5: # %cond.load4
+; SKX_32-NEXT: vpextrd $2, %xmm2, %eax
+; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm1
+; SKX_32-NEXT: .LBB29_6: # %else5
+; SKX_32-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
+; SKX_32-NEXT: addl $12, %esp
+; SKX_32-NEXT: retl
%sext_ind = sext <3 x i32> %ind to <3 x i64>
%gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
; KNL_64-LABEL: test_gather_16i32:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_32-LABEL: test_gather_16i32:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
; SKX-LABEL: test_gather_16i32:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
-; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm2
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX_32-LABEL: test_gather_16i32:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
-; SKX_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1
+; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT: vmovaps %zmm2, %zmm0
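; On 32-bit SKX the 16 x 64-bit gathers and scatters split into two 8-wide
; dword-indexed operations, with the upper mask half obtained via kshiftrw $8.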
; KNL_64-LABEL: test_gather_16i64:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL_32-NEXT: vpandd .LCPI32_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
; KNL_32-NEXT: kshiftrw $8, %k1, %k2
; SKX-LABEL: test_gather_16i64:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
-; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
; SKX-NEXT: vmovaps %zmm3, %zmm0
; SKX-NEXT: vmovaps %zmm4, %zmm1
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test_gather_16i64:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: pushl %ebp
+; SKX_32-NEXT: .Ltmp1:
+; SKX_32-NEXT: .cfi_def_cfa_offset 8
+; SKX_32-NEXT: .Ltmp2:
+; SKX_32-NEXT: .cfi_offset %ebp, -8
+; SKX_32-NEXT: movl %esp, %ebp
+; SKX_32-NEXT: .Ltmp3:
+; SKX_32-NEXT: .cfi_def_cfa_register %ebp
+; SKX_32-NEXT: andl $-64, %esp
+; SKX_32-NEXT: subl $64, %esp
+; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
+; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
+; SKX_32-NEXT: kshiftrw $8, %k1, %k2
+; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
+; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
+; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
+; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: movl %ebp, %esp
+; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: retl
%res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
ret <16 x i64> %res
}
; KNL_64-LABEL: test_gather_16f32:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_32-LABEL: test_gather_16f32:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL_32-NEXT: vpandd .LCPI33_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
; SKX-LABEL: test_gather_16f32:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
-; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm2
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
; SKX-NEXT: vinsertf32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test_gather_16f32:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
+; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
+; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: retl
%res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
ret <16 x float> %res
}
; KNL_64-LABEL: test_gather_16f64:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL_32-NEXT: vpandd .LCPI34_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
; KNL_32-NEXT: kshiftrw $8, %k1, %k2
; SKX-LABEL: test_gather_16f64:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
-; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
; SKX-NEXT: vmovaps %zmm3, %zmm0
; SKX-NEXT: vmovaps %zmm4, %zmm1
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test_gather_16f64:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: pushl %ebp
+; SKX_32-NEXT: .Ltmp4:
+; SKX_32-NEXT: .cfi_def_cfa_offset 8
+; SKX_32-NEXT: .Ltmp5:
+; SKX_32-NEXT: .cfi_offset %ebp, -8
+; SKX_32-NEXT: movl %esp, %ebp
+; SKX_32-NEXT: .Ltmp6:
+; SKX_32-NEXT: .cfi_def_cfa_register %ebp
+; SKX_32-NEXT: andl $-64, %esp
+; SKX_32-NEXT: subl $64, %esp
+; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
+; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
+; SKX_32-NEXT: kshiftrw $8, %k1, %k2
+; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
+; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
+; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
+; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: movl %ebp, %esp
+; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: retl
%res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
ret <16 x double> %res
}
; KNL_64-LABEL: test_scatter_16i32:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
; KNL_32-LABEL: test_scatter_16i32:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
; KNL_32-NEXT: retl
; SKX-LABEL: test_scatter_16i32:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
-; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
; SKX_32-LABEL: test_scatter_16i32:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
-; SKX_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
+; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
; SKX_32-NEXT: retl
; KNL_64-LABEL: test_scatter_16i64:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL_32-NEXT: vpandd .LCPI36_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
; KNL_32-NEXT: kshiftrw $8, %k1, %k2
; SKX-LABEL: test_scatter_16i64:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
-; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test_scatter_16i64:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: pushl %ebp
+; SKX_32-NEXT: .Ltmp7:
+; SKX_32-NEXT: .cfi_def_cfa_offset 8
+; SKX_32-NEXT: .Ltmp8:
+; SKX_32-NEXT: .cfi_offset %ebp, -8
+; SKX_32-NEXT: movl %esp, %ebp
+; SKX_32-NEXT: .Ltmp9:
+; SKX_32-NEXT: .cfi_def_cfa_register %ebp
+; SKX_32-NEXT: andl $-64, %esp
+; SKX_32-NEXT: subl $64, %esp
+; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
+; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
+; SKX_32-NEXT: kshiftrw $8, %k1, %k2
+; SKX_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
+; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
+; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
+; SKX_32-NEXT: movl %ebp, %esp
+; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
ret void
}
; KNL_64-LABEL: test_scatter_16f32:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
; KNL_32-LABEL: test_scatter_16f32:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL_32-NEXT: vpandd .LCPI37_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
; KNL_32-NEXT: retl
; SKX-LABEL: test_scatter_16f32:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
-; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0
; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test_scatter_16f32:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
+; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
+; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
ret void
}
; KNL_64-LABEL: test_scatter_16f64:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL_32-NEXT: vpandd .LCPI38_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
; KNL_32-NEXT: kshiftrw $8, %k1, %k2
; SKX-LABEL: test_scatter_16f64:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
-; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test_scatter_16f64:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: pushl %ebp
+; SKX_32-NEXT: .Ltmp10:
+; SKX_32-NEXT: .cfi_def_cfa_offset 8
+; SKX_32-NEXT: .Ltmp11:
+; SKX_32-NEXT: .cfi_offset %ebp, -8
+; SKX_32-NEXT: movl %esp, %ebp
+; SKX_32-NEXT: .Ltmp12:
+; SKX_32-NEXT: .cfi_def_cfa_register %ebp
+; SKX_32-NEXT: andl $-64, %esp
+; SKX_32-NEXT: subl $64, %esp
+; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
+; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
+; SKX_32-NEXT: kshiftrw $8, %k1, %k2
+; SKX_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
+; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
+; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
+; SKX_32-NEXT: movl %ebp, %esp
+; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
ret void
}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s --check-prefix=AVX512
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s --check-prefix=AVX2
; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s --check-prefix=AVX_SCALAR
declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>*, i32, <16 x i1>, <16 x %mystruct*>)
-; AVX512-LABEL: test24
-; AVX512: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
-; AVX512: kshiftrw $8, %k1, %k1
-; AVX512: vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
-
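; test24 loads 16 pointers under a v16i1 mask: SKX normalizes the byte mask
; with vpsllw $7 before vpmovb2m, while AVX2 has no mask registers and expands
; into four vpmaskmovq loads.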
define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) {
+; AVX512-LABEL: test24:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
+; AVX512-NEXT: kshiftrw $8, %k1, %k1
+; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
+; AVX512-NEXT: retq
+;
+; AVX2-LABEL: test24:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT: vpmaskmovq (%rdi), %ymm1, %ymm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT: vpmaskmovq 96(%rdi), %ymm1, %ymm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT: vpmaskmovq 64(%rdi), %ymm1, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm0, %ymm1
+; AVX2-NEXT: vmovdqa %ymm4, %ymm0
+; AVX2-NEXT: retq
+;
+; SKX-LABEL: test24:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: kshiftrw $8, %k1, %k1
+; SKX-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
+; SKX-NEXT: retq
%res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1>%mask, <16 x %mystruct*>zeroinitializer)
ret <16 x %mystruct*> %res
}
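; The 16 x 64-bit masked stores and loads follow the same mask normalization:
; vpmovsxbd + vpslld $31 + vptestmd on AVX512F, vpsllw $7 + vpmovb2m on SKX.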
define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
+; AVX512-LABEL: test_store_16i64:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
+; AVX512-NEXT: kshiftrw $8, %k1, %k1
+; AVX512-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1}
+; AVX512-NEXT: retq
+;
+; AVX2-LABEL: test_store_16i64:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm5, %xmm5
+; AVX2-NEXT: vpsrad $31, %xmm5, %xmm5
+; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5
+; AVX2-NEXT: vpmaskmovq %ymm1, %ymm5, (%rdi)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT: vpmaskmovq %ymm4, %ymm1, 96(%rdi)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT: vpmaskmovq %ymm3, %ymm1, 64(%rdi)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vpmaskmovq %ymm2, %ymm0, 32(%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
; SKX-LABEL: test_store_16i64:
; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
; SKX-NEXT: kshiftrw $8, %k1, %k1
}
declare void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask)
define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
+; AVX512-LABEL: test_store_16f64:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512-NEXT: vmovupd %zmm1, (%rdi) {%k1}
+; AVX512-NEXT: kshiftrw $8, %k1, %k1
+; AVX512-NEXT: vmovupd %zmm2, 64(%rdi) {%k1}
+; AVX512-NEXT: retq
+;
+; AVX2-LABEL: test_store_16f64:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm5, %xmm5
+; AVX2-NEXT: vpsrad $31, %xmm5, %xmm5
+; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5
+; AVX2-NEXT: vmaskmovpd %ymm1, %ymm5, (%rdi)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT: vmaskmovpd %ymm4, %ymm1, 96(%rdi)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT: vmaskmovpd %ymm3, %ymm1, 64(%rdi)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vmaskmovpd %ymm2, %ymm0, 32(%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
; SKX-LABEL: test_store_16f64:
; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vmovupd %zmm1, (%rdi) {%k1}
; SKX-NEXT: kshiftrw $8, %k1, %k1
}
declare void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask)
define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
+; AVX512-LABEL: test_load_16i64:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
+; AVX512-NEXT: kshiftrw $8, %k1, %k1
+; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %zmm2, %zmm1
+; AVX512-NEXT: retq
+;
+; AVX2-LABEL: test_load_16i64:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm5, %xmm5
+; AVX2-NEXT: vpsrad $31, %xmm5, %xmm5
+; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5
+; AVX2-NEXT: vpmaskmovq (%rdi), %ymm5, %ymm9
+; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm7, %xmm7
+; AVX2-NEXT: vpsrad $31, %xmm7, %xmm7
+; AVX2-NEXT: vpmovsxdq %xmm7, %ymm7
+; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm7, %ymm8
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm6, %xmm6
+; AVX2-NEXT: vpsrad $31, %xmm6, %xmm6
+; AVX2-NEXT: vpmovsxdq %xmm6, %ymm6
+; AVX2-NEXT: vpmaskmovq 64(%rdi), %ymm6, %ymm10
+; AVX2-NEXT: vblendvpd %ymm5, %ymm9, %ymm1, %ymm5
+; AVX2-NEXT: vblendvpd %ymm7, %ymm8, %ymm2, %ymm1
+; AVX2-NEXT: vblendvpd %ymm6, %ymm10, %ymm3, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vpmaskmovq 96(%rdi), %ymm0, %ymm3
+; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vmovapd %ymm5, %ymm0
+; AVX2-NEXT: retq
+;
; SKX-LABEL: test_load_16i64:
; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
; SKX-NEXT: kshiftrw $8, %k1, %k1
}
declare <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
+; AVX512-LABEL: test_load_16f64:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; AVX512-NEXT: kshiftrw $8, %k1, %k1
+; AVX512-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %zmm2, %zmm1
+; AVX512-NEXT: retq
+;
+; AVX2-LABEL: test_load_16f64:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm5, %xmm5
+; AVX2-NEXT: vpsrad $31, %xmm5, %xmm5
+; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5
+; AVX2-NEXT: vmaskmovpd (%rdi), %ymm5, %ymm9
+; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm7, %xmm7
+; AVX2-NEXT: vpsrad $31, %xmm7, %xmm7
+; AVX2-NEXT: vpmovsxdq %xmm7, %ymm7
+; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm7, %ymm8
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm6, %xmm6
+; AVX2-NEXT: vpsrad $31, %xmm6, %xmm6
+; AVX2-NEXT: vpmovsxdq %xmm6, %ymm6
+; AVX2-NEXT: vmaskmovpd 64(%rdi), %ymm6, %ymm10
+; AVX2-NEXT: vblendvpd %ymm5, %ymm9, %ymm1, %ymm5
+; AVX2-NEXT: vblendvpd %ymm7, %ymm8, %ymm2, %ymm1
+; AVX2-NEXT: vblendvpd %ymm6, %ymm10, %ymm3, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vmaskmovpd 96(%rdi), %ymm0, %ymm3
+; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vmovapd %ymm5, %ymm0
+; AVX2-NEXT: retq
+;
; SKX-LABEL: test_load_16f64:
; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1}
; SKX-NEXT: kshiftrw $8, %k1, %k1
declare <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
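; The 32-element case uses a ymm byte mask: vpsllw $7 + vpmovb2m %ymm0 yields
; a 32-bit k-register that is split with kshiftrd $16.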
define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) {
+; AVX512-LABEL: test_load_32f64:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX512-NEXT: vpmovsxbd %xmm5, %zmm5
+; AVX512-NEXT: vpslld $31, %zmm5, %zmm5
+; AVX512-NEXT: vptestmd %zmm5, %zmm5, %k1
+; AVX512-NEXT: vmovupd 128(%rdi), %zmm3 {%k1}
+; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k2
+; AVX512-NEXT: vmovupd (%rdi), %zmm1 {%k2}
+; AVX512-NEXT: kshiftrw $8, %k1, %k1
+; AVX512-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
+; AVX512-NEXT: kshiftrw $8, %k2, %k1
+; AVX512-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %zmm2, %zmm1
+; AVX512-NEXT: vmovaps %zmm3, %zmm2
+; AVX512-NEXT: vmovaps %zmm4, %zmm3
+; AVX512-NEXT: retq
+;
+; AVX2-LABEL: test_load_32f64:
+; AVX2: ## BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: Ltmp0:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: Ltmp1:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: Ltmp2:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $32, %rsp
+; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm8, %xmm8
+; AVX2-NEXT: vpsrad $31, %xmm8, %xmm8
+; AVX2-NEXT: vpmovsxdq %xmm8, %ymm8
+; AVX2-NEXT: vmaskmovpd 32(%rsi), %ymm8, %ymm9
+; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm10, %xmm10
+; AVX2-NEXT: vpsrad $31, %xmm10, %xmm10
+; AVX2-NEXT: vpmovsxdq %xmm10, %ymm10
+; AVX2-NEXT: vmaskmovpd 64(%rsi), %ymm10, %ymm11
+; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[3,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm12, %xmm12
+; AVX2-NEXT: vpsrad $31, %xmm12, %xmm12
+; AVX2-NEXT: vpmovsxdq %xmm12, %ymm12
+; AVX2-NEXT: vmaskmovpd 96(%rsi), %ymm12, %ymm13
+; AVX2-NEXT: vblendvpd %ymm8, %ymm9, %ymm2, %ymm8
+; AVX2-NEXT: vblendvpd %ymm10, %ymm11, %ymm3, %ymm9
+; AVX2-NEXT: vblendvpd %ymm12, %ymm13, %ymm4, %ymm11
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm3, %xmm3
+; AVX2-NEXT: vpsrad $31, %xmm3, %xmm3
+; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
+; AVX2-NEXT: vmaskmovpd 160(%rsi), %ymm3, %ymm10
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm4, %xmm4
+; AVX2-NEXT: vpsrad $31, %xmm4, %xmm4
+; AVX2-NEXT: vpmovsxdq %xmm4, %ymm4
+; AVX2-NEXT: vmaskmovpd 192(%rsi), %ymm4, %ymm12
+; AVX2-NEXT: vblendvpd %ymm3, %ymm10, %ymm6, %ymm3
+; AVX2-NEXT: vmovapd 16(%rbp), %ymm6
+; AVX2-NEXT: vblendvpd %ymm4, %ymm12, %ymm7, %ymm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm7, %xmm7
+; AVX2-NEXT: vpsrad $31, %xmm7, %xmm7
+; AVX2-NEXT: vpmovsxdq %xmm7, %ymm7
+; AVX2-NEXT: vmaskmovpd 224(%rsi), %ymm7, %ymm10
+; AVX2-NEXT: vblendvpd %ymm7, %ymm10, %ymm6, %ymm6
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vmaskmovpd (%rsi), %ymm0, %ymm7
+; AVX2-NEXT: vblendvpd %ymm0, %ymm7, %ymm1, %ymm0
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT: vmaskmovpd 128(%rsi), %ymm1, %ymm2
+; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm5, %ymm1
+; AVX2-NEXT: vmovapd %ymm1, 128(%rdi)
+; AVX2-NEXT: vmovapd %ymm0, (%rdi)
+; AVX2-NEXT: vmovapd %ymm6, 224(%rdi)
+; AVX2-NEXT: vmovapd %ymm4, 192(%rdi)
+; AVX2-NEXT: vmovapd %ymm3, 160(%rdi)
+; AVX2-NEXT: vmovapd %ymm11, 96(%rdi)
+; AVX2-NEXT: vmovapd %ymm9, 64(%rdi)
+; AVX2-NEXT: vmovapd %ymm8, 32(%rdi)
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
; SKX-LABEL: test_load_32f64:
; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
; SKX-NEXT: vpmovb2m %ymm0, %k1
; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1}
; SKX-NEXT: kshiftrd $16, %k1, %k2
; AVX512F-LABEL: test_vshuff64x2_512_maskz:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm2
; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
; AVX512F-NEXT: retq
; AVX512F-32-LABEL: test_vshuff64x2_512_maskz:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpmovsxwq %xmm2, %zmm2
-; AVX512F-32-NEXT: vpandq .LCPI122_0, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpsllvq .LCPI122_0, %zmm2, %zmm2
; AVX512F-32-NEXT: vptestmq %zmm2, %zmm2, %k1
; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
; AVX512F-32-NEXT: retl
; AVX512F-LABEL: test_vshufi64x2_512_mask:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm2
; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
; AVX512F-NEXT: retq
; AVX512F-32-LABEL: test_vshufi64x2_512_mask:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpmovsxwq %xmm2, %zmm2
-; AVX512F-32-NEXT: vpandq .LCPI123_0, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpsllvq .LCPI123_0, %zmm2, %zmm2
; AVX512F-32-NEXT: vptestmq %zmm2, %zmm2, %k1
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
; AVX512F-32-NEXT: retl
; AVX512F-LABEL: test_vshuff64x2_512_mem_mask:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
; AVX512F-NEXT: retq
; AVX512F-32-LABEL: test_vshuff64x2_512_mem_mask:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpmovsxwq %xmm1, %zmm1
-; AVX512F-32-NEXT: vpandq .LCPI125_0, %zmm1, %zmm1
+; AVX512F-32-NEXT: vpsllvq .LCPI125_0, %zmm1, %zmm1
; AVX512F-32-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
; AVX512F-LABEL: test_vshuff64x2_512_mem_maskz:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
; AVX512F-NEXT: retq
; AVX512F-32-LABEL: test_vshuff64x2_512_mem_maskz:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpmovsxwq %xmm1, %zmm1
-; AVX512F-32-NEXT: vpandq .LCPI126_0, %zmm1, %zmm1
+; AVX512F-32-NEXT: vpsllvq .LCPI126_0, %zmm1, %zmm1
; AVX512F-32-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq | FileCheck %s --check-prefix=VL_BW_DQ
;
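; Shuffles of i1 vectors now re-normalize before every vpmov*2m: the live bit
; is shifted into the MSB (vpsllq $63, vpslld $31, vpsllw $15/$7) since the
; instruction tests only the sign bit of each element.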
; VL_BW_DQ-LABEL: shuf2i1_1_0:
; VL_BW_DQ: # BB#0:
+; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf2i1_1_2:
; VL_BW_DQ: # BB#0:
+; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT: movb $1, %al
; VL_BW_DQ-NEXT: kmovb %eax, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm1
; VL_BW_DQ-NEXT: vpalignr $8, %xmm0, %xmm1, %xmm0
+; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
; VL_BW_DQ: # BB#0:
+; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT: retq
; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT: retq
; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm2 {%k1} {z}
; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
-; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm1
+; AVX512F-NEXT: vpslld $31, %zmm2, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm1
; VL_BW_DQ-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm1
-; VL_BW_DQ-NEXT: vpmovd2m %zmm1, %k0
+; VL_BW_DQ-NEXT: vpslld $31, %zmm1, %zmm0
+; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0
; VL_BW_DQ-NEXT: retq
%a2 = icmp eq <16 x i32> %a, %a1
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; VL_BW_DQ: # BB#0:
+; VL_BW_DQ-NEXT: vpsllw $7, %ymm0, %ymm0
; VL_BW_DQ-NEXT: vpmovb2m %ymm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT: vmovdqu16 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; VL_BW_DQ-NEXT: vpsllw $15, %zmm0, %zmm0
; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2b %k0, %ymm0
; VL_BW_DQ-NEXT: retq
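+; There is no byte-granularity immediate shift, so vpsllw $7 stands in
+; before vpmovb2m: the word-sized shift still places every byte's bit 0
+; at that byte's own bit 7, which is the only bit b2m reads.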
; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2
; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1
; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT: retq
; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: retq
; VL_BW_DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
; VL_BW_DQ-NEXT: retq
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1]
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: retq
; VL_BW_DQ-NEXT: kmovb %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1]
+; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
; VL_BW_DQ-NEXT: retq
; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: retq
; VL_BW_DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
; VL_BW_DQ-NEXT: retq
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
; AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: retq
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
; VL_BW_DQ-NEXT: vpxord %zmm2, %zmm2, %zmm2
; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
+; VL_BW_DQ-NEXT: vpsllq $63, %zmm2, %zmm0
+; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: retq
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm1
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
; VL_BW_DQ-NEXT: retq
; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1
-; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm3
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3
-; AVX512F-NEXT: vpandq %zmm3, %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
+; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
; VL_BW_DQ: # BB#0:
+; VL_BW_DQ-NEXT: vpsllw $15, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
+; VL_BW_DQ-NEXT: vpsllq $63, %zmm2, %zmm0
+; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
; VL_BW_DQ-NEXT: retq
%c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
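+; The vpsllq $63 ahead of the final vpmovq2m is not redundant here: one
+; permute input is a broadcast constant (elided above) that presumably
+; carries the i1 in bit 0 rather than 0/all-ones lanes, so its sign
+; bits do not yet reflect the mask values q2m would read.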
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
-; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: retq
; VL_BW_DQ-NEXT: kmovw %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm0
; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0
+; VL_BW_DQ-NEXT: vpslld $31, %zmm0, %zmm0
; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovw %k0, %eax
; VL_BW_DQ-NEXT: retq
}
define i64 @shuf64i1_zero(i64 %a) {
+; AVX512F-LABEL: shuf64i1_zero:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: .Ltmp0:
+; AVX512F-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-NEXT: .Ltmp1:
+; AVX512F-NEXT: .cfi_offset %rbp, -16
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: .Ltmp2:
+; AVX512F-NEXT: .cfi_def_cfa_register %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $32, %rsp
+; AVX512F-NEXT: movb $0, (%rsp)
+; AVX512F-NEXT: movl (%rsp), %ecx
+; AVX512F-NEXT: movq %rcx, %rax
+; AVX512F-NEXT: shlq $32, %rax
+; AVX512F-NEXT: orq %rcx, %rax
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
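+; Without BWI a v64i1 value has no legal register form, so the mask
+; appears to be round-tripped through a 32-byte aligned stack slot,
+; with the two 32-bit halves reassembled into the i64 return value via
+; shlq $32 and orq.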
+;
; VL_BW_DQ-LABEL: shuf64i1_zero:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: kxorq %k0, %k0, %k0