[llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
[IntrNoMem]>;
+ def int_x86_avx512_mask_vpermil_pd_128 :
+ GCCBuiltin<"__builtin_ia32_vpermilpd_mask">,
+ Intrinsic<[llvm_v2f64_ty],
+ [llvm_v2f64_ty, llvm_i32_ty, llvm_v2f64_ty, llvm_i8_ty],
+ [IntrNoMem]>;
+
+ def int_x86_avx512_mask_vpermil_pd_256 :
+ GCCBuiltin<"__builtin_ia32_vpermilpd256_mask">,
+ Intrinsic<[llvm_v4f64_ty],
+ [llvm_v4f64_ty, llvm_i32_ty, llvm_v4f64_ty, llvm_i8_ty],
+ [IntrNoMem]>;
+
+ def int_x86_avx512_mask_vpermil_pd_512 :
+ GCCBuiltin<"__builtin_ia32_vpermilpd512_mask">,
+ Intrinsic<[llvm_v8f64_ty],
+ [llvm_v8f64_ty, llvm_i32_ty, llvm_v8f64_ty, llvm_i8_ty],
+ [IntrNoMem]>;
+
+ def int_x86_avx512_mask_vpermil_ps_128 :
+ GCCBuiltin<"__builtin_ia32_vpermilps_mask">,
+ Intrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_i32_ty, llvm_v4f32_ty, llvm_i8_ty],
+ [IntrNoMem]>;
+
+ def int_x86_avx512_mask_vpermil_ps_256 :
+ GCCBuiltin<"__builtin_ia32_vpermilps256_mask">,
+ Intrinsic<[llvm_v8f32_ty],
+ [llvm_v8f32_ty, llvm_i32_ty, llvm_v8f32_ty, llvm_i8_ty],
+ [IntrNoMem]>;
+
+ def int_x86_avx512_mask_vpermil_ps_512 :
+ GCCBuiltin<"__builtin_ia32_vpermilps512_mask">,
+ Intrinsic<[llvm_v16f32_ty],
+ [llvm_v16f32_ty, llvm_i32_ty, llvm_v16f32_ty, llvm_i16_ty],
+ [IntrNoMem]>;
+
+ def int_x86_avx512_mask_vpermilvar_pd_256 :
+ GCCBuiltin<"__builtin_ia32_vpermilvarpd256_mask">,
+ Intrinsic<[llvm_v4f64_ty],
+ [llvm_v4f64_ty, llvm_v4i64_ty, llvm_v4f64_ty, llvm_i8_ty],
+ [IntrNoMem]>;
+
+ def int_x86_avx512_mask_vpermilvar_pd_512 :
+ GCCBuiltin<"__builtin_ia32_vpermilvarpd512_mask">,
+ Intrinsic<[llvm_v8f64_ty],
+ [llvm_v8f64_ty, llvm_v8i64_ty, llvm_v8f64_ty, llvm_i8_ty],
+ [IntrNoMem]>;
+
+ def int_x86_avx512_mask_vpermilvar_pd_128 :
+ GCCBuiltin<"__builtin_ia32_vpermilvarpd_mask">,
+ Intrinsic<[llvm_v2f64_ty],
+ [llvm_v2f64_ty, llvm_v2i64_ty, llvm_v2f64_ty, llvm_i8_ty],
+ [IntrNoMem]>;
+
+ def int_x86_avx512_mask_vpermilvar_ps_256 :
+ GCCBuiltin<"__builtin_ia32_vpermilvarps256_mask">,
+ Intrinsic<[llvm_v8f32_ty],
+ [llvm_v8f32_ty, llvm_v8i32_ty, llvm_v8f32_ty, llvm_i8_ty],
+ [IntrNoMem]>;
+
+ def int_x86_avx512_mask_vpermilvar_ps_512 :
+ GCCBuiltin<"__builtin_ia32_vpermilvarps512_mask">,
+ Intrinsic<[llvm_v16f32_ty],
+ [llvm_v16f32_ty, llvm_v16i32_ty, llvm_v16f32_ty, llvm_i16_ty],
+ [IntrNoMem]>;
+
+ def int_x86_avx512_mask_vpermilvar_ps_128 :
+ GCCBuiltin<"__builtin_ia32_vpermilvarps_mask">,
+ Intrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_v4i32_ty, llvm_v4f32_ty, llvm_i8_ty],
+ [IntrNoMem]>;
+
def int_x86_avx512_mask_pshuf_b_128 :
GCCBuiltin<"__builtin_ia32_pshufb128_mask">,
Intrinsic<[llvm_v16i8_ty],
RoundingMode, Sae),
Mask, Src0, Subtarget, DAG);
}
- case INTR_TYPE_2OP_MASK: {
+ case INTR_TYPE_2OP_MASK:
+ case INTR_TYPE_2OP_IMM8_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
+
+ if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
+ Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
+
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
}
//===----------------------------------------------------------------------===//
-// AVX-512 - VPERM
-//
-// -- immediate form --
-multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
- let ExeDomain = _.ExeDomain in {
- def ri : AVX512AIi8<opc, MRMSrcReg, (outs _.RC:$dst),
- (ins _.RC:$src1, u8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.RC:$dst,
- (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>,
- EVEX;
- def mi : AVX512AIi8<opc, MRMSrcMem, (outs _.RC:$dst),
- (ins _.MemOp:$src1, u8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.RC:$dst,
- (_.VT (OpNode (_.LdFrag addr:$src1),
- (i8 imm:$src2))))]>,
- EVEX, EVEX_CD8<_.EltSize, CD8VF>;
-}
-}
-
-multiclass avx512_permil<bits<8> OpcImm, bits<8> OpcVar, X86VectorVTInfo _,
- X86VectorVTInfo Ctrl> :
- avx512_perm_imm<OpcImm, "vpermil" # _.Suffix, X86VPermilpi, _> {
- let ExeDomain = _.ExeDomain in {
- def rr : AVX5128I<OpcVar, MRMSrcReg, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2),
- !strconcat("vpermil" # _.Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.RC:$dst,
- (_.VT (X86VPermilpv _.RC:$src1,
- (Ctrl.VT Ctrl.RC:$src2))))]>,
- EVEX_4V;
- def rm : AVX5128I<OpcVar, MRMSrcMem, (outs _.RC:$dst),
- (ins _.RC:$src1, Ctrl.MemOp:$src2),
- !strconcat("vpermil" # _.Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.RC:$dst,
- (_.VT (X86VPermilpv _.RC:$src1,
- (Ctrl.VT (Ctrl.LdFrag addr:$src2)))))]>,
- EVEX_4V;
- }
-}
-defm VPERMILPSZ : avx512_permil<0x04, 0x0C, v16f32_info, v16i32_info>,
- EVEX_V512;
-defm VPERMILPDZ : avx512_permil<0x05, 0x0D, v8f64_info, v8i64_info>,
- EVEX_V512, VEX_W;
-
-def : Pat<(v16i32 (X86VPermilpi VR512:$src1, (i8 imm:$imm))),
- (VPERMILPSZri VR512:$src1, imm:$imm)>;
-def : Pat<(v8i64 (X86VPermilpi VR512:$src1, (i8 imm:$imm))),
- (VPERMILPDZri VR512:$src1, imm:$imm)>;
-
// -- VPERM2I - 3 source operands form --
multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr,
SDNode OpNode, X86VectorVTInfo _> {
defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd",
X86VPermi, avx512vl_f64_info>,
EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
+//===----------------------------------------------------------------------===//
+// AVX-512 - VPERMIL
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, X86VectorVTInfo Ctrl> {
+ defm rr: AVX512_maskable<OpcVar, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, Ctrl.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1,
+ (Ctrl.VT Ctrl.RC:$src2)))>,
+ T8PD, EVEX_4V;
+ let mayLoad = 1 in {
+ defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode
+ _.RC:$src1,
+ (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>,
+ T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (_.VT (OpNode
+ _.RC:$src1,
+ (Ctrl.VT (X86VBroadcast
+ (Ctrl.ScalarLdFrag addr:$src2)))))>,
+ T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+ }//let mayLoad = 1
+}
+
+multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar,
+ AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info512,
+ Ctrl.info512>, EVEX_V512;
+ }
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info128,
+ Ctrl.info128>, EVEX_V128;
+ defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info256,
+ Ctrl.info256>, EVEX_V256;
+ }
+}
+
+multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar,
+ AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{
+
+ defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, _, Ctrl>;
+ defm NAME: avx512_shift_rmi_sizes<OpcImm, MRMSrcReg, MRMSrcMem, OpcodeStr,
+ X86VPermilpi, _>,
+ EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>;
+
+ let isCodeGenOnly = 1 in {
+ // lowering implementation with the alternative types
+ defm NAME#_I: avx512_permil_vec_common<OpcodeStr, OpcVar, Ctrl, Ctrl>;
+ defm NAME#_I: avx512_shift_rmi_sizes<OpcImm, MRMSrcReg, MRMSrcMem,
+ OpcodeStr, X86VPermilpi, Ctrl>,
+ EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>;
+ }
+}
+defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info,
+ avx512vl_i32_info>;
+defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info,
+ avx512vl_i64_info>, VEX_W;
//===----------------------------------------------------------------------===//
// AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW
//===----------------------------------------------------------------------===//
(bitconvert (i_frag addr:$src2))))]>, VEX_4V,
Sched<[WriteFShuffleLd, ReadAfterLd]>;
- def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
+ let Predicates = [HasAVX, NoVLX] in {
+ def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
Sched<[WriteFShuffle]>;
- def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
+ def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
(ins x86memop_f:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
Sched<[WriteFShuffleLd]>;
+ }// Predicates = [HasAVX, NoVLX]
}
let ExeDomain = SSEPackedSingle in {
loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
}
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))),
(VPERMILPSYrr VR256:$src1, VR256:$src2)>;
def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_2OP_IMM8, INTR_TYPE_3OP, INTR_TYPE_4OP,
CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, VSHIFT_MASK, COMI,
INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
- INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM,
+ INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK,
INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK,
FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK,
VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
X86ISD::VPERMIV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_512, VPERM_3OP_MASK,
X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_128, INTR_TYPE_2OP_IMM8_MASK,
+ X86ISD::VPERMILPI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_256, INTR_TYPE_2OP_IMM8_MASK,
+ X86ISD::VPERMILPI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_512, INTR_TYPE_2OP_IMM8_MASK,
+ X86ISD::VPERMILPI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_128, INTR_TYPE_2OP_IMM8_MASK,
+ X86ISD::VPERMILPI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_256, INTR_TYPE_2OP_IMM8_MASK,
+ X86ISD::VPERMILPI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_512, INTR_TYPE_2OP_IMM8_MASK,
+ X86ISD::VPERMILPI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_128, INTR_TYPE_2OP_MASK,
+ X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_256, INTR_TYPE_2OP_MASK,
+ X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_512, INTR_TYPE_2OP_MASK,
+ X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_128, INTR_TYPE_2OP_MASK,
+ X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_256, INTR_TYPE_2OP_MASK,
+ X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_512, INTR_TYPE_2OP_MASK,
+ X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK,
ret <16 x float> %res2
}
+declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpermilpd $22, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpermilpd $22, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpermilpd $22, %zmm0, %zmm0
+; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res3, %res2
+ ret <8 x double> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float>, i32, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermilps $22, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpermilps $22, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpermilps $22, %zmm0, %zmm0
+; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res3, %res2
+ ret <16 x float> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res2, %res3
+ ret <8 x double> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermilps %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpermilps %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT: vpermilps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res2, %res3
+ ret <16 x float> %res4
+}
+
declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i8)
define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i8 %x4) {
ret <4 x i64> %res2
}
+declare <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double>, i32, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_vpermil_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpermilpd $22, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpermilpd $22, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpermilpd $22, %ymm0, %ymm0
+; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> zeroinitializer, i8 %x3)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 -1)
+ %res3 = fadd <4 x double> %res, %res1
+ %res4 = fadd <4 x double> %res2, %res3
+ ret <4 x double> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double>, i32, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_vpermil_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> zeroinitializer, i8 %x3)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 -1)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res3, %res2
+ ret <2 x double> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float>, i32, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_vpermil_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpermilps $22, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpermilps $22, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpermilps $22, %ymm0, %ymm0
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> zeroinitializer, i8 %x3)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 -1)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res3, %res2
+ ret <8 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float>, i32, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_vpermil_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpermilps $22, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpermilps $22, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpermilps $22, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> zeroinitializer, i8 %x3)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 -1)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res2, %res3
+ ret <4 x float> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_vpermilvar_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> zeroinitializer, i8 %x3)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1)
+ %res3 = fadd <4 x double> %res, %res1
+ %res4 = fadd <4 x double> %res2, %res3
+ ret <4 x double> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double>, <2 x i64>, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_vpermilvar_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> zeroinitializer, i8 %x3)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res3, %res2
+ ret <2 x double> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float>, <8 x i32>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_vpermilvar_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> zeroinitializer, i8 %x3)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res3, %res2
+ ret <8 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float>, <4 x i32>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_vpermilvar_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> zeroinitializer, i8 %x3)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res2, %res3
+ ret <4 x float> %res4
+}
+
declare <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float>, <4 x float>, i32, <8 x float>, i8)
define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3, i8 %x4) {
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
; AVX: # BB#0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: retq
+
+; AVX512VL-LABEL: shuffle_v2f64_10:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpermilpd $1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 0>
ret <2 x double> %shuffle
}
; AVX: # BB#0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-NEXT: retq
+
+; AVX512VL-LABEL: shuffle_v2f64_32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpermilpd $1, %xmm1, %xmm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 2>
ret <2 x double> %shuffle
}
; AVX: # BB#0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = mem[1,0]
; AVX-NEXT: retq
+
+; AVX512VL-LABEL: shuffle_mem_v2f64_10:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpermilpd $1, (%rdi), %xmm0
+; AVX512VL-NEXT: retq
%a = load <2 x double>, <2 x double>* %ptr
%shuffle = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 0>
ret <2 x double> %shuffle
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mcpu=knl -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL
target triple = "x86_64-unknown-unknown"
; ALL: # BB#0:
; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
; ALL-NEXT: retq
+
+; AVX512VL-LABEL: shuffle_v4f64_0023:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpermilpd $8, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
ret <4 x double> %shuffle
}
// CHECK: encoding: [0x62,0xa2,0xfd,0x41,0xa2,0x94,0x81,0x00,0x04,0x00,0x00]
vscatterdpd %zmm18, 1024(%rcx, %ymm24,4) {%k1}
+// CHECK: vpermilps $171, %zmm22, %zmm2
+// CHECK: encoding: [0x62,0xb3,0x7d,0x48,0x04,0xd6,0xab]
+ vpermilps $0xab, %zmm22, %zmm2
+
+// CHECK: vpermilps $171, %zmm22, %zmm2 {%k2}
+// CHECK: encoding: [0x62,0xb3,0x7d,0x4a,0x04,0xd6,0xab]
+ vpermilps $0xab, %zmm22, %zmm2 {%k2}
+
+// CHECK: vpermilps $171, %zmm22, %zmm2 {%k2} {z}
+// CHECK: encoding: [0x62,0xb3,0x7d,0xca,0x04,0xd6,0xab]
+ vpermilps $0xab, %zmm22, %zmm2 {%k2} {z}
+
+// CHECK: vpermilps $123, %zmm22, %zmm2
+// CHECK: encoding: [0x62,0xb3,0x7d,0x48,0x04,0xd6,0x7b]
+ vpermilps $0x7b, %zmm22, %zmm2
+
+// CHECK: vpermilps $123, (%rcx), %zmm2
+// CHECK: encoding: [0x62,0xf3,0x7d,0x48,0x04,0x11,0x7b]
+ vpermilps $0x7b, (%rcx), %zmm2
+
+// CHECK: vpermilps $123, 291(%rax,%r14,8), %zmm2
+// CHECK: encoding: [0x62,0xb3,0x7d,0x48,0x04,0x94,0xf0,0x23,0x01,0x00,0x00,0x7b]
+ vpermilps $0x7b, 291(%rax,%r14,8), %zmm2
+
+// CHECK: vpermilps $123, (%rcx){1to16}, %zmm2
+// CHECK: encoding: [0x62,0xf3,0x7d,0x58,0x04,0x11,0x7b]
+ vpermilps $0x7b, (%rcx){1to16}, %zmm2
+
+// CHECK: vpermilps $123, 8128(%rdx), %zmm2
+// CHECK: encoding: [0x62,0xf3,0x7d,0x48,0x04,0x52,0x7f,0x7b]
+ vpermilps $0x7b, 8128(%rdx), %zmm2
+
+// CHECK: vpermilps $123, 8192(%rdx), %zmm2
+// CHECK: encoding: [0x62,0xf3,0x7d,0x48,0x04,0x92,0x00,0x20,0x00,0x00,0x7b]
+ vpermilps $0x7b, 8192(%rdx), %zmm2
+
+// CHECK: vpermilps $123, -8192(%rdx), %zmm2
+// CHECK: encoding: [0x62,0xf3,0x7d,0x48,0x04,0x52,0x80,0x7b]
+ vpermilps $0x7b, -8192(%rdx), %zmm2
+
+// CHECK: vpermilps $123, -8256(%rdx), %zmm2
+// CHECK: encoding: [0x62,0xf3,0x7d,0x48,0x04,0x92,0xc0,0xdf,0xff,0xff,0x7b]
+ vpermilps $0x7b, -8256(%rdx), %zmm2
+
+// CHECK: vpermilps $123, 508(%rdx){1to16}, %zmm2
+// CHECK: encoding: [0x62,0xf3,0x7d,0x58,0x04,0x52,0x7f,0x7b]
+ vpermilps $0x7b, 508(%rdx){1to16}, %zmm2
+
+// CHECK: vpermilps $123, 512(%rdx){1to16}, %zmm2
+// CHECK: encoding: [0x62,0xf3,0x7d,0x58,0x04,0x92,0x00,0x02,0x00,0x00,0x7b]
+ vpermilps $0x7b, 512(%rdx){1to16}, %zmm2
+
+// CHECK: vpermilps $123, -512(%rdx){1to16}, %zmm2
+// CHECK: encoding: [0x62,0xf3,0x7d,0x58,0x04,0x52,0x80,0x7b]
+ vpermilps $0x7b, -512(%rdx){1to16}, %zmm2
+
+// CHECK: vpermilps $123, -516(%rdx){1to16}, %zmm2
+// CHECK: encoding: [0x62,0xf3,0x7d,0x58,0x04,0x92,0xfc,0xfd,0xff,0xff,0x7b]
+ vpermilps $0x7b, -516(%rdx){1to16}, %zmm2
+
+// CHECK: vpermilps %zmm2, %zmm20, %zmm13
+// CHECK: encoding: [0x62,0x72,0x5d,0x40,0x0c,0xea]
+ vpermilps %zmm2, %zmm20, %zmm13
+
+// CHECK: vpermilps %zmm2, %zmm20, %zmm13 {%k1}
+// CHECK: encoding: [0x62,0x72,0x5d,0x41,0x0c,0xea]
+ vpermilps %zmm2, %zmm20, %zmm13 {%k1}
+
+// CHECK: vpermilps %zmm2, %zmm20, %zmm13 {%k1} {z}
+// CHECK: encoding: [0x62,0x72,0x5d,0xc1,0x0c,0xea]
+ vpermilps %zmm2, %zmm20, %zmm13 {%k1} {z}
+
+// CHECK: vpermilps (%rcx), %zmm20, %zmm13
+// CHECK: encoding: [0x62,0x72,0x5d,0x40,0x0c,0x29]
+ vpermilps (%rcx), %zmm20, %zmm13
+
+// CHECK: vpermilps 291(%rax,%r14,8), %zmm20, %zmm13
+// CHECK: encoding: [0x62,0x32,0x5d,0x40,0x0c,0xac,0xf0,0x23,0x01,0x00,0x00]
+ vpermilps 291(%rax,%r14,8), %zmm20, %zmm13
+
+// CHECK: vpermilps (%rcx){1to16}, %zmm20, %zmm13
+// CHECK: encoding: [0x62,0x72,0x5d,0x50,0x0c,0x29]
+ vpermilps (%rcx){1to16}, %zmm20, %zmm13
+
+// CHECK: vpermilps 8128(%rdx), %zmm20, %zmm13
+// CHECK: encoding: [0x62,0x72,0x5d,0x40,0x0c,0x6a,0x7f]
+ vpermilps 8128(%rdx), %zmm20, %zmm13
+
+// CHECK: vpermilps 8192(%rdx), %zmm20, %zmm13
+// CHECK: encoding: [0x62,0x72,0x5d,0x40,0x0c,0xaa,0x00,0x20,0x00,0x00]
+ vpermilps 8192(%rdx), %zmm20, %zmm13
+
+// CHECK: vpermilps -8192(%rdx), %zmm20, %zmm13
+// CHECK: encoding: [0x62,0x72,0x5d,0x40,0x0c,0x6a,0x80]
+ vpermilps -8192(%rdx), %zmm20, %zmm13
+
+// CHECK: vpermilps -8256(%rdx), %zmm20, %zmm13
+// CHECK: encoding: [0x62,0x72,0x5d,0x40,0x0c,0xaa,0xc0,0xdf,0xff,0xff]
+ vpermilps -8256(%rdx), %zmm20, %zmm13
+
+// CHECK: vpermilps 508(%rdx){1to16}, %zmm20, %zmm13
+// CHECK: encoding: [0x62,0x72,0x5d,0x50,0x0c,0x6a,0x7f]
+ vpermilps 508(%rdx){1to16}, %zmm20, %zmm13
+
+// CHECK: vpermilps 512(%rdx){1to16}, %zmm20, %zmm13
+// CHECK: encoding: [0x62,0x72,0x5d,0x50,0x0c,0xaa,0x00,0x02,0x00,0x00]
+ vpermilps 512(%rdx){1to16}, %zmm20, %zmm13
+
+// CHECK: vpermilps -512(%rdx){1to16}, %zmm20, %zmm13
+// CHECK: encoding: [0x62,0x72,0x5d,0x50,0x0c,0x6a,0x80]
+ vpermilps -512(%rdx){1to16}, %zmm20, %zmm13
+
+// CHECK: vpermilps -516(%rdx){1to16}, %zmm20, %zmm13
+// CHECK: encoding: [0x62,0x72,0x5d,0x50,0x0c,0xaa,0xfc,0xfd,0xff,0xff]
+ vpermilps -516(%rdx){1to16}, %zmm20, %zmm13
+
+// CHECK: vpermilpd $171, %zmm4, %zmm19
+// CHECK: encoding: [0x62,0xe3,0xfd,0x48,0x05,0xdc,0xab]
+ vpermilpd $0xab, %zmm4, %zmm19
+
+// CHECK: vpermilpd $171, %zmm4, %zmm19 {%k1}
+// CHECK: encoding: [0x62,0xe3,0xfd,0x49,0x05,0xdc,0xab]
+ vpermilpd $0xab, %zmm4, %zmm19 {%k1}
+
+// CHECK: vpermilpd $171, %zmm4, %zmm19 {%k1} {z}
+// CHECK: encoding: [0x62,0xe3,0xfd,0xc9,0x05,0xdc,0xab]
+ vpermilpd $0xab, %zmm4, %zmm19 {%k1} {z}
+
+// CHECK: vpermilpd $123, %zmm4, %zmm19
+// CHECK: encoding: [0x62,0xe3,0xfd,0x48,0x05,0xdc,0x7b]
+ vpermilpd $0x7b, %zmm4, %zmm19
+
+// CHECK: vpermilpd $123, (%rcx), %zmm19
+// CHECK: encoding: [0x62,0xe3,0xfd,0x48,0x05,0x19,0x7b]
+ vpermilpd $0x7b, (%rcx), %zmm19
+
+// CHECK: vpermilpd $123, 291(%rax,%r14,8), %zmm19
+// CHECK: encoding: [0x62,0xa3,0xfd,0x48,0x05,0x9c,0xf0,0x23,0x01,0x00,0x00,0x7b]
+ vpermilpd $0x7b, 291(%rax,%r14,8), %zmm19
+
+// CHECK: vpermilpd $123, (%rcx){1to8}, %zmm19
+// CHECK: encoding: [0x62,0xe3,0xfd,0x58,0x05,0x19,0x7b]
+ vpermilpd $0x7b, (%rcx){1to8}, %zmm19
+
+// CHECK: vpermilpd $123, 8128(%rdx), %zmm19
+// CHECK: encoding: [0x62,0xe3,0xfd,0x48,0x05,0x5a,0x7f,0x7b]
+ vpermilpd $0x7b, 8128(%rdx), %zmm19
+
+// CHECK: vpermilpd $123, 8192(%rdx), %zmm19
+// CHECK: encoding: [0x62,0xe3,0xfd,0x48,0x05,0x9a,0x00,0x20,0x00,0x00,0x7b]
+ vpermilpd $0x7b, 8192(%rdx), %zmm19
+
+// CHECK: vpermilpd $123, -8192(%rdx), %zmm19
+// CHECK: encoding: [0x62,0xe3,0xfd,0x48,0x05,0x5a,0x80,0x7b]
+ vpermilpd $0x7b, -8192(%rdx), %zmm19
+
+// CHECK: vpermilpd $123, -8256(%rdx), %zmm19
+// CHECK: encoding: [0x62,0xe3,0xfd,0x48,0x05,0x9a,0xc0,0xdf,0xff,0xff,0x7b]
+ vpermilpd $0x7b, -8256(%rdx), %zmm19
+
+// CHECK: vpermilpd $123, 1016(%rdx){1to8}, %zmm19
+// CHECK: encoding: [0x62,0xe3,0xfd,0x58,0x05,0x5a,0x7f,0x7b]
+ vpermilpd $0x7b, 1016(%rdx){1to8}, %zmm19
+
+// CHECK: vpermilpd $123, 1024(%rdx){1to8}, %zmm19
+// CHECK: encoding: [0x62,0xe3,0xfd,0x58,0x05,0x9a,0x00,0x04,0x00,0x00,0x7b]
+ vpermilpd $0x7b, 1024(%rdx){1to8}, %zmm19
+
+// CHECK: vpermilpd $123, -1024(%rdx){1to8}, %zmm19
+// CHECK: encoding: [0x62,0xe3,0xfd,0x58,0x05,0x5a,0x80,0x7b]
+ vpermilpd $0x7b, -1024(%rdx){1to8}, %zmm19
+
+// CHECK: vpermilpd $123, -1032(%rdx){1to8}, %zmm19
+// CHECK: encoding: [0x62,0xe3,0xfd,0x58,0x05,0x9a,0xf8,0xfb,0xff,0xff,0x7b]
+ vpermilpd $0x7b, -1032(%rdx){1to8}, %zmm19
+
+// CHECK: vpermilpd %zmm21, %zmm26, %zmm1
+// CHECK: encoding: [0x62,0xb2,0xad,0x40,0x0d,0xcd]
+ vpermilpd %zmm21, %zmm26, %zmm1
+
+// CHECK: vpermilpd %zmm21, %zmm26, %zmm1 {%k2}
+// CHECK: encoding: [0x62,0xb2,0xad,0x42,0x0d,0xcd]
+ vpermilpd %zmm21, %zmm26, %zmm1 {%k2}
+
+// CHECK: vpermilpd %zmm21, %zmm26, %zmm1 {%k2} {z}
+// CHECK: encoding: [0x62,0xb2,0xad,0xc2,0x0d,0xcd]
+ vpermilpd %zmm21, %zmm26, %zmm1 {%k2} {z}
+
+// CHECK: vpermilpd (%rcx), %zmm26, %zmm1
+// CHECK: encoding: [0x62,0xf2,0xad,0x40,0x0d,0x09]
+ vpermilpd (%rcx), %zmm26, %zmm1
+
+// CHECK: vpermilpd 291(%rax,%r14,8), %zmm26, %zmm1
+// CHECK: encoding: [0x62,0xb2,0xad,0x40,0x0d,0x8c,0xf0,0x23,0x01,0x00,0x00]
+ vpermilpd 291(%rax,%r14,8), %zmm26, %zmm1
+
+// CHECK: vpermilpd (%rcx){1to8}, %zmm26, %zmm1
+// CHECK: encoding: [0x62,0xf2,0xad,0x50,0x0d,0x09]
+ vpermilpd (%rcx){1to8}, %zmm26, %zmm1
+
+// CHECK: vpermilpd 8128(%rdx), %zmm26, %zmm1
+// CHECK: encoding: [0x62,0xf2,0xad,0x40,0x0d,0x4a,0x7f]
+ vpermilpd 8128(%rdx), %zmm26, %zmm1
+
+// CHECK: vpermilpd 8192(%rdx), %zmm26, %zmm1
+// CHECK: encoding: [0x62,0xf2,0xad,0x40,0x0d,0x8a,0x00,0x20,0x00,0x00]
+ vpermilpd 8192(%rdx), %zmm26, %zmm1
+
+// CHECK: vpermilpd -8192(%rdx), %zmm26, %zmm1
+// CHECK: encoding: [0x62,0xf2,0xad,0x40,0x0d,0x4a,0x80]
+ vpermilpd -8192(%rdx), %zmm26, %zmm1
+
+// CHECK: vpermilpd -8256(%rdx), %zmm26, %zmm1
+// CHECK: encoding: [0x62,0xf2,0xad,0x40,0x0d,0x8a,0xc0,0xdf,0xff,0xff]
+ vpermilpd -8256(%rdx), %zmm26, %zmm1
+
+// CHECK: vpermilpd 1016(%rdx){1to8}, %zmm26, %zmm1
+// CHECK: encoding: [0x62,0xf2,0xad,0x50,0x0d,0x4a,0x7f]
+ vpermilpd 1016(%rdx){1to8}, %zmm26, %zmm1
+
+// CHECK: vpermilpd 1024(%rdx){1to8}, %zmm26, %zmm1
+// CHECK: encoding: [0x62,0xf2,0xad,0x50,0x0d,0x8a,0x00,0x04,0x00,0x00]
+ vpermilpd 1024(%rdx){1to8}, %zmm26, %zmm1
+
+// CHECK: vpermilpd -1024(%rdx){1to8}, %zmm26, %zmm1
+// CHECK: encoding: [0x62,0xf2,0xad,0x50,0x0d,0x4a,0x80]
+ vpermilpd -1024(%rdx){1to8}, %zmm26, %zmm1
+
+// CHECK: vpermilpd -1032(%rdx){1to8}, %zmm26, %zmm1
+// CHECK: encoding: [0x62,0xf2,0xad,0x50,0x0d,0x8a,0xf8,0xfb,0xff,0xff]
+ vpermilpd -1032(%rdx){1to8}, %zmm26, %zmm1
+
// CHECK: vcvtpd2dq %zmm15, %ymm24
// CHECK: encoding: [0x62,0x41,0xff,0x48,0xe6,0xc7]
vcvtpd2dq %zmm15, %ymm24
// CHECK: encoding: [0x62,0x22,0xfd,0x21,0xa2,0xb4,0xb9,0x00,0x04,0x00,0x00]
vscatterdpd %ymm30, 1024(%rcx, %xmm31,4) {%k1}
+// CHECK: vpermilps $171, %xmm28, %xmm20
+// CHECK: encoding: [0x62,0x83,0x7d,0x08,0x04,0xe4,0xab]
+ vpermilps $0xab, %xmm28, %xmm20
+
+// CHECK: vpermilps $171, %xmm28, %xmm20 {%k4}
+// CHECK: encoding: [0x62,0x83,0x7d,0x0c,0x04,0xe4,0xab]
+ vpermilps $0xab, %xmm28, %xmm20 {%k4}
+
+// CHECK: vpermilps $171, %xmm28, %xmm20 {%k4} {z}
+// CHECK: encoding: [0x62,0x83,0x7d,0x8c,0x04,0xe4,0xab]
+ vpermilps $0xab, %xmm28, %xmm20 {%k4} {z}
+
+// CHECK: vpermilps $123, %xmm28, %xmm20
+// CHECK: encoding: [0x62,0x83,0x7d,0x08,0x04,0xe4,0x7b]
+ vpermilps $0x7b, %xmm28, %xmm20
+
+// CHECK: vpermilps $123, (%rcx), %xmm20
+// CHECK: encoding: [0x62,0xe3,0x7d,0x08,0x04,0x21,0x7b]
+ vpermilps $0x7b, (%rcx), %xmm20
+
+// CHECK: vpermilps $123, 291(%rax,%r14,8), %xmm20
+// CHECK: encoding: [0x62,0xa3,0x7d,0x08,0x04,0xa4,0xf0,0x23,0x01,0x00,0x00,0x7b]
+ vpermilps $0x7b, 291(%rax,%r14,8), %xmm20
+
+// CHECK: vpermilps $123, (%rcx){1to4}, %xmm20
+// CHECK: encoding: [0x62,0xe3,0x7d,0x18,0x04,0x21,0x7b]
+ vpermilps $0x7b, (%rcx){1to4}, %xmm20
+
+// CHECK: vpermilps $123, 2032(%rdx), %xmm20
+// CHECK: encoding: [0x62,0xe3,0x7d,0x08,0x04,0x62,0x7f,0x7b]
+ vpermilps $0x7b, 2032(%rdx), %xmm20
+
+// CHECK: vpermilps $123, 2048(%rdx), %xmm20
+// CHECK: encoding: [0x62,0xe3,0x7d,0x08,0x04,0xa2,0x00,0x08,0x00,0x00,0x7b]
+ vpermilps $0x7b, 2048(%rdx), %xmm20
+
+// CHECK: vpermilps $123, -2048(%rdx), %xmm20
+// CHECK: encoding: [0x62,0xe3,0x7d,0x08,0x04,0x62,0x80,0x7b]
+ vpermilps $0x7b, -2048(%rdx), %xmm20
+
+// CHECK: vpermilps $123, -2064(%rdx), %xmm20
+// CHECK: encoding: [0x62,0xe3,0x7d,0x08,0x04,0xa2,0xf0,0xf7,0xff,0xff,0x7b]
+ vpermilps $0x7b, -2064(%rdx), %xmm20
+
+// CHECK: vpermilps $123, 508(%rdx){1to4}, %xmm20
+// CHECK: encoding: [0x62,0xe3,0x7d,0x18,0x04,0x62,0x7f,0x7b]
+ vpermilps $0x7b, 508(%rdx){1to4}, %xmm20
+
+// CHECK: vpermilps $123, 512(%rdx){1to4}, %xmm20
+// CHECK: encoding: [0x62,0xe3,0x7d,0x18,0x04,0xa2,0x00,0x02,0x00,0x00,0x7b]
+ vpermilps $0x7b, 512(%rdx){1to4}, %xmm20
+
+// CHECK: vpermilps $123, -512(%rdx){1to4}, %xmm20
+// CHECK: encoding: [0x62,0xe3,0x7d,0x18,0x04,0x62,0x80,0x7b]
+ vpermilps $0x7b, -512(%rdx){1to4}, %xmm20
+
+// CHECK: vpermilps $123, -516(%rdx){1to4}, %xmm20
+// CHECK: encoding: [0x62,0xe3,0x7d,0x18,0x04,0xa2,0xfc,0xfd,0xff,0xff,0x7b]
+ vpermilps $0x7b, -516(%rdx){1to4}, %xmm20
+
+// CHECK: vpermilps $171, %ymm17, %ymm30
+// CHECK: encoding: [0x62,0x23,0x7d,0x28,0x04,0xf1,0xab]
+ vpermilps $0xab, %ymm17, %ymm30
+
+// CHECK: vpermilps $171, %ymm17, %ymm30 {%k5}
+// CHECK: encoding: [0x62,0x23,0x7d,0x2d,0x04,0xf1,0xab]
+ vpermilps $0xab, %ymm17, %ymm30 {%k5}
+
+// CHECK: vpermilps $171, %ymm17, %ymm30 {%k5} {z}
+// CHECK: encoding: [0x62,0x23,0x7d,0xad,0x04,0xf1,0xab]
+ vpermilps $0xab, %ymm17, %ymm30 {%k5} {z}
+
+// CHECK: vpermilps $123, %ymm17, %ymm30
+// CHECK: encoding: [0x62,0x23,0x7d,0x28,0x04,0xf1,0x7b]
+ vpermilps $0x7b, %ymm17, %ymm30
+
+// CHECK: vpermilps $123, (%rcx), %ymm30
+// CHECK: encoding: [0x62,0x63,0x7d,0x28,0x04,0x31,0x7b]
+ vpermilps $0x7b, (%rcx), %ymm30
+
+// CHECK: vpermilps $123, 291(%rax,%r14,8), %ymm30
+// CHECK: encoding: [0x62,0x23,0x7d,0x28,0x04,0xb4,0xf0,0x23,0x01,0x00,0x00,0x7b]
+ vpermilps $0x7b, 291(%rax,%r14,8), %ymm30
+
+// CHECK: vpermilps $123, (%rcx){1to8}, %ymm30
+// CHECK: encoding: [0x62,0x63,0x7d,0x38,0x04,0x31,0x7b]
+ vpermilps $0x7b, (%rcx){1to8}, %ymm30
+
+// CHECK: vpermilps $123, 4064(%rdx), %ymm30
+// CHECK: encoding: [0x62,0x63,0x7d,0x28,0x04,0x72,0x7f,0x7b]
+ vpermilps $0x7b, 4064(%rdx), %ymm30
+
+// CHECK: vpermilps $123, 4096(%rdx), %ymm30
+// CHECK: encoding: [0x62,0x63,0x7d,0x28,0x04,0xb2,0x00,0x10,0x00,0x00,0x7b]
+ vpermilps $0x7b, 4096(%rdx), %ymm30
+
+// CHECK: vpermilps $123, -4096(%rdx), %ymm30
+// CHECK: encoding: [0x62,0x63,0x7d,0x28,0x04,0x72,0x80,0x7b]
+ vpermilps $0x7b, -4096(%rdx), %ymm30
+
+// CHECK: vpermilps $123, -4128(%rdx), %ymm30
+// CHECK: encoding: [0x62,0x63,0x7d,0x28,0x04,0xb2,0xe0,0xef,0xff,0xff,0x7b]
+ vpermilps $0x7b, -4128(%rdx), %ymm30
+
+// CHECK: vpermilps $123, 508(%rdx){1to8}, %ymm30
+// CHECK: encoding: [0x62,0x63,0x7d,0x38,0x04,0x72,0x7f,0x7b]
+ vpermilps $0x7b, 508(%rdx){1to8}, %ymm30
+
+// CHECK: vpermilps $123, 512(%rdx){1to8}, %ymm30
+// CHECK: encoding: [0x62,0x63,0x7d,0x38,0x04,0xb2,0x00,0x02,0x00,0x00,0x7b]
+ vpermilps $0x7b, 512(%rdx){1to8}, %ymm30
+
+// CHECK: vpermilps $123, -512(%rdx){1to8}, %ymm30
+// CHECK: encoding: [0x62,0x63,0x7d,0x38,0x04,0x72,0x80,0x7b]
+ vpermilps $0x7b, -512(%rdx){1to8}, %ymm30
+
+// CHECK: vpermilps $123, -516(%rdx){1to8}, %ymm30
+// CHECK: encoding: [0x62,0x63,0x7d,0x38,0x04,0xb2,0xfc,0xfd,0xff,0xff,0x7b]
+ vpermilps $0x7b, -516(%rdx){1to8}, %ymm30
+
+// CHECK: vpermilps %xmm22, %xmm28, %xmm28
+// CHECK: encoding: [0x62,0x22,0x1d,0x00,0x0c,0xe6]
+ vpermilps %xmm22, %xmm28, %xmm28
+
+// CHECK: vpermilps %xmm22, %xmm28, %xmm28 {%k6}
+// CHECK: encoding: [0x62,0x22,0x1d,0x06,0x0c,0xe6]
+ vpermilps %xmm22, %xmm28, %xmm28 {%k6}
+
+// CHECK: vpermilps %xmm22, %xmm28, %xmm28 {%k6} {z}
+// CHECK: encoding: [0x62,0x22,0x1d,0x86,0x0c,0xe6]
+ vpermilps %xmm22, %xmm28, %xmm28 {%k6} {z}
+
+// CHECK: vpermilps (%rcx), %xmm28, %xmm28
+// CHECK: encoding: [0x62,0x62,0x1d,0x00,0x0c,0x21]
+ vpermilps (%rcx), %xmm28, %xmm28
+
+// CHECK: vpermilps 291(%rax,%r14,8), %xmm28, %xmm28
+// CHECK: encoding: [0x62,0x22,0x1d,0x00,0x0c,0xa4,0xf0,0x23,0x01,0x00,0x00]
+ vpermilps 291(%rax,%r14,8), %xmm28, %xmm28
+
+// CHECK: vpermilps (%rcx){1to4}, %xmm28, %xmm28
+// CHECK: encoding: [0x62,0x62,0x1d,0x10,0x0c,0x21]
+ vpermilps (%rcx){1to4}, %xmm28, %xmm28
+
+// CHECK: vpermilps 2032(%rdx), %xmm28, %xmm28
+// CHECK: encoding: [0x62,0x62,0x1d,0x00,0x0c,0x62,0x7f]
+ vpermilps 2032(%rdx), %xmm28, %xmm28
+
+// CHECK: vpermilps 2048(%rdx), %xmm28, %xmm28
+// CHECK: encoding: [0x62,0x62,0x1d,0x00,0x0c,0xa2,0x00,0x08,0x00,0x00]
+ vpermilps 2048(%rdx), %xmm28, %xmm28
+
+// CHECK: vpermilps -2048(%rdx), %xmm28, %xmm28
+// CHECK: encoding: [0x62,0x62,0x1d,0x00,0x0c,0x62,0x80]
+ vpermilps -2048(%rdx), %xmm28, %xmm28
+
+// CHECK: vpermilps -2064(%rdx), %xmm28, %xmm28
+// CHECK: encoding: [0x62,0x62,0x1d,0x00,0x0c,0xa2,0xf0,0xf7,0xff,0xff]
+ vpermilps -2064(%rdx), %xmm28, %xmm28
+
+// CHECK: vpermilps 508(%rdx){1to4}, %xmm28, %xmm28
+// CHECK: encoding: [0x62,0x62,0x1d,0x10,0x0c,0x62,0x7f]
+ vpermilps 508(%rdx){1to4}, %xmm28, %xmm28
+
+// CHECK: vpermilps 512(%rdx){1to4}, %xmm28, %xmm28
+// CHECK: encoding: [0x62,0x62,0x1d,0x10,0x0c,0xa2,0x00,0x02,0x00,0x00]
+ vpermilps 512(%rdx){1to4}, %xmm28, %xmm28
+
+// CHECK: vpermilps -512(%rdx){1to4}, %xmm28, %xmm28
+// CHECK: encoding: [0x62,0x62,0x1d,0x10,0x0c,0x62,0x80]
+ vpermilps -512(%rdx){1to4}, %xmm28, %xmm28
+
+// CHECK: vpermilps -516(%rdx){1to4}, %xmm28, %xmm28
+// CHECK: encoding: [0x62,0x62,0x1d,0x10,0x0c,0xa2,0xfc,0xfd,0xff,0xff]
+ vpermilps -516(%rdx){1to4}, %xmm28, %xmm28
+
+// CHECK: vpermilps %ymm21, %ymm28, %ymm29
+// CHECK: encoding: [0x62,0x22,0x1d,0x20,0x0c,0xed]
+ vpermilps %ymm21, %ymm28, %ymm29
+
+// CHECK: vpermilps %ymm21, %ymm28, %ymm29 {%k2}
+// CHECK: encoding: [0x62,0x22,0x1d,0x22,0x0c,0xed]
+ vpermilps %ymm21, %ymm28, %ymm29 {%k2}
+
+// CHECK: vpermilps %ymm21, %ymm28, %ymm29 {%k2} {z}
+// CHECK: encoding: [0x62,0x22,0x1d,0xa2,0x0c,0xed]
+ vpermilps %ymm21, %ymm28, %ymm29 {%k2} {z}
+
+// CHECK: vpermilps (%rcx), %ymm28, %ymm29
+// CHECK: encoding: [0x62,0x62,0x1d,0x20,0x0c,0x29]
+ vpermilps (%rcx), %ymm28, %ymm29
+
+// CHECK: vpermilps 291(%rax,%r14,8), %ymm28, %ymm29
+// CHECK: encoding: [0x62,0x22,0x1d,0x20,0x0c,0xac,0xf0,0x23,0x01,0x00,0x00]
+ vpermilps 291(%rax,%r14,8), %ymm28, %ymm29
+
+// CHECK: vpermilps (%rcx){1to8}, %ymm28, %ymm29
+// CHECK: encoding: [0x62,0x62,0x1d,0x30,0x0c,0x29]
+ vpermilps (%rcx){1to8}, %ymm28, %ymm29
+
+// CHECK: vpermilps 4064(%rdx), %ymm28, %ymm29
+// CHECK: encoding: [0x62,0x62,0x1d,0x20,0x0c,0x6a,0x7f]
+ vpermilps 4064(%rdx), %ymm28, %ymm29
+
+// CHECK: vpermilps 4096(%rdx), %ymm28, %ymm29
+// CHECK: encoding: [0x62,0x62,0x1d,0x20,0x0c,0xaa,0x00,0x10,0x00,0x00]
+ vpermilps 4096(%rdx), %ymm28, %ymm29
+
+// CHECK: vpermilps -4096(%rdx), %ymm28, %ymm29
+// CHECK: encoding: [0x62,0x62,0x1d,0x20,0x0c,0x6a,0x80]
+ vpermilps -4096(%rdx), %ymm28, %ymm29
+
+// CHECK: vpermilps -4128(%rdx), %ymm28, %ymm29
+// CHECK: encoding: [0x62,0x62,0x1d,0x20,0x0c,0xaa,0xe0,0xef,0xff,0xff]
+ vpermilps -4128(%rdx), %ymm28, %ymm29
+
+// CHECK: vpermilps 508(%rdx){1to8}, %ymm28, %ymm29
+// CHECK: encoding: [0x62,0x62,0x1d,0x30,0x0c,0x6a,0x7f]
+ vpermilps 508(%rdx){1to8}, %ymm28, %ymm29
+
+// CHECK: vpermilps 512(%rdx){1to8}, %ymm28, %ymm29
+// CHECK: encoding: [0x62,0x62,0x1d,0x30,0x0c,0xaa,0x00,0x02,0x00,0x00]
+ vpermilps 512(%rdx){1to8}, %ymm28, %ymm29
+
+// CHECK: vpermilps -512(%rdx){1to8}, %ymm28, %ymm29
+// CHECK: encoding: [0x62,0x62,0x1d,0x30,0x0c,0x6a,0x80]
+ vpermilps -512(%rdx){1to8}, %ymm28, %ymm29
+
+// CHECK: vpermilps -516(%rdx){1to8}, %ymm28, %ymm29
+// CHECK: encoding: [0x62,0x62,0x1d,0x30,0x0c,0xaa,0xfc,0xfd,0xff,0xff]
+ vpermilps -516(%rdx){1to8}, %ymm28, %ymm29
+
+// CHECK: vpermilpd $171, %xmm19, %xmm29
+// CHECK: encoding: [0x62,0x23,0xfd,0x08,0x05,0xeb,0xab]
+ vpermilpd $0xab, %xmm19, %xmm29
+
+// CHECK: vpermilpd $171, %xmm19, %xmm29 {%k7}
+// CHECK: encoding: [0x62,0x23,0xfd,0x0f,0x05,0xeb,0xab]
+ vpermilpd $0xab, %xmm19, %xmm29 {%k7}
+
+// CHECK: vpermilpd $171, %xmm19, %xmm29 {%k7} {z}
+// CHECK: encoding: [0x62,0x23,0xfd,0x8f,0x05,0xeb,0xab]
+ vpermilpd $0xab, %xmm19, %xmm29 {%k7} {z}
+
+// CHECK: vpermilpd $123, %xmm19, %xmm29
+// CHECK: encoding: [0x62,0x23,0xfd,0x08,0x05,0xeb,0x7b]
+ vpermilpd $0x7b, %xmm19, %xmm29
+
+// CHECK: vpermilpd $123, (%rcx), %xmm29
+// CHECK: encoding: [0x62,0x63,0xfd,0x08,0x05,0x29,0x7b]
+ vpermilpd $0x7b, (%rcx), %xmm29
+
+// CHECK: vpermilpd $123, 291(%rax,%r14,8), %xmm29
+// CHECK: encoding: [0x62,0x23,0xfd,0x08,0x05,0xac,0xf0,0x23,0x01,0x00,0x00,0x7b]
+ vpermilpd $0x7b, 291(%rax,%r14,8), %xmm29
+
+// CHECK: vpermilpd $123, (%rcx){1to2}, %xmm29
+// CHECK: encoding: [0x62,0x63,0xfd,0x18,0x05,0x29,0x7b]
+ vpermilpd $0x7b, (%rcx){1to2}, %xmm29
+
+// CHECK: vpermilpd $123, 2032(%rdx), %xmm29
+// CHECK: encoding: [0x62,0x63,0xfd,0x08,0x05,0x6a,0x7f,0x7b]
+ vpermilpd $0x7b, 2032(%rdx), %xmm29
+
+// CHECK: vpermilpd $123, 2048(%rdx), %xmm29
+// CHECK: encoding: [0x62,0x63,0xfd,0x08,0x05,0xaa,0x00,0x08,0x00,0x00,0x7b]
+ vpermilpd $0x7b, 2048(%rdx), %xmm29
+
+// CHECK: vpermilpd $123, -2048(%rdx), %xmm29
+// CHECK: encoding: [0x62,0x63,0xfd,0x08,0x05,0x6a,0x80,0x7b]
+ vpermilpd $0x7b, -2048(%rdx), %xmm29
+
+// CHECK: vpermilpd $123, -2064(%rdx), %xmm29
+// CHECK: encoding: [0x62,0x63,0xfd,0x08,0x05,0xaa,0xf0,0xf7,0xff,0xff,0x7b]
+ vpermilpd $0x7b, -2064(%rdx), %xmm29
+
+// CHECK: vpermilpd $123, 1016(%rdx){1to2}, %xmm29
+// CHECK: encoding: [0x62,0x63,0xfd,0x18,0x05,0x6a,0x7f,0x7b]
+ vpermilpd $0x7b, 1016(%rdx){1to2}, %xmm29
+
+// CHECK: vpermilpd $123, 1024(%rdx){1to2}, %xmm29
+// CHECK: encoding: [0x62,0x63,0xfd,0x18,0x05,0xaa,0x00,0x04,0x00,0x00,0x7b]
+ vpermilpd $0x7b, 1024(%rdx){1to2}, %xmm29
+
+// CHECK: vpermilpd $123, -1024(%rdx){1to2}, %xmm29
+// CHECK: encoding: [0x62,0x63,0xfd,0x18,0x05,0x6a,0x80,0x7b]
+ vpermilpd $0x7b, -1024(%rdx){1to2}, %xmm29
+
+// CHECK: vpermilpd $123, -1032(%rdx){1to2}, %xmm29
+// CHECK: encoding: [0x62,0x63,0xfd,0x18,0x05,0xaa,0xf8,0xfb,0xff,0xff,0x7b]
+ vpermilpd $0x7b, -1032(%rdx){1to2}, %xmm29
+
+// CHECK: vpermilpd $171, %ymm24, %ymm17
+// CHECK: encoding: [0x62,0x83,0xfd,0x28,0x05,0xc8,0xab]
+ vpermilpd $0xab, %ymm24, %ymm17
+
+// CHECK: vpermilpd $171, %ymm24, %ymm17 {%k6}
+// CHECK: encoding: [0x62,0x83,0xfd,0x2e,0x05,0xc8,0xab]
+ vpermilpd $0xab, %ymm24, %ymm17 {%k6}
+
+// CHECK: vpermilpd $171, %ymm24, %ymm17 {%k6} {z}
+// CHECK: encoding: [0x62,0x83,0xfd,0xae,0x05,0xc8,0xab]
+ vpermilpd $0xab, %ymm24, %ymm17 {%k6} {z}
+
+// CHECK: vpermilpd $123, %ymm24, %ymm17
+// CHECK: encoding: [0x62,0x83,0xfd,0x28,0x05,0xc8,0x7b]
+ vpermilpd $0x7b, %ymm24, %ymm17
+
+// CHECK: vpermilpd $123, (%rcx), %ymm17
+// CHECK: encoding: [0x62,0xe3,0xfd,0x28,0x05,0x09,0x7b]
+ vpermilpd $0x7b, (%rcx), %ymm17
+
+// CHECK: vpermilpd $123, 291(%rax,%r14,8), %ymm17
+// CHECK: encoding: [0x62,0xa3,0xfd,0x28,0x05,0x8c,0xf0,0x23,0x01,0x00,0x00,0x7b]
+ vpermilpd $0x7b, 291(%rax,%r14,8), %ymm17
+
+// CHECK: vpermilpd $123, (%rcx){1to4}, %ymm17
+// CHECK: encoding: [0x62,0xe3,0xfd,0x38,0x05,0x09,0x7b]
+ vpermilpd $0x7b, (%rcx){1to4}, %ymm17
+
+// CHECK: vpermilpd $123, 4064(%rdx), %ymm17
+// CHECK: encoding: [0x62,0xe3,0xfd,0x28,0x05,0x4a,0x7f,0x7b]
+ vpermilpd $0x7b, 4064(%rdx), %ymm17
+
+// CHECK: vpermilpd $123, 4096(%rdx), %ymm17
+// CHECK: encoding: [0x62,0xe3,0xfd,0x28,0x05,0x8a,0x00,0x10,0x00,0x00,0x7b]
+ vpermilpd $0x7b, 4096(%rdx), %ymm17
+
+// CHECK: vpermilpd $123, -4096(%rdx), %ymm17
+// CHECK: encoding: [0x62,0xe3,0xfd,0x28,0x05,0x4a,0x80,0x7b]
+ vpermilpd $0x7b, -4096(%rdx), %ymm17
+
+// CHECK: vpermilpd $123, -4128(%rdx), %ymm17
+// CHECK: encoding: [0x62,0xe3,0xfd,0x28,0x05,0x8a,0xe0,0xef,0xff,0xff,0x7b]
+ vpermilpd $0x7b, -4128(%rdx), %ymm17
+
+// CHECK: vpermilpd $123, 1016(%rdx){1to4}, %ymm17
+// CHECK: encoding: [0x62,0xe3,0xfd,0x38,0x05,0x4a,0x7f,0x7b]
+ vpermilpd $0x7b, 1016(%rdx){1to4}, %ymm17
+
+// CHECK: vpermilpd $123, 1024(%rdx){1to4}, %ymm17
+// CHECK: encoding: [0x62,0xe3,0xfd,0x38,0x05,0x8a,0x00,0x04,0x00,0x00,0x7b]
+ vpermilpd $0x7b, 1024(%rdx){1to4}, %ymm17
+
+// CHECK: vpermilpd $123, -1024(%rdx){1to4}, %ymm17
+// CHECK: encoding: [0x62,0xe3,0xfd,0x38,0x05,0x4a,0x80,0x7b]
+ vpermilpd $0x7b, -1024(%rdx){1to4}, %ymm17
+
+// CHECK: vpermilpd $123, -1032(%rdx){1to4}, %ymm17
+// CHECK: encoding: [0x62,0xe3,0xfd,0x38,0x05,0x8a,0xf8,0xfb,0xff,0xff,0x7b]
+ vpermilpd $0x7b, -1032(%rdx){1to4}, %ymm17
+
+// CHECK: vpermilpd %xmm17, %xmm27, %xmm26
+// CHECK: encoding: [0x62,0x22,0xa5,0x00,0x0d,0xd1]
+ vpermilpd %xmm17, %xmm27, %xmm26
+
+// CHECK: vpermilpd %xmm17, %xmm27, %xmm26 {%k2}
+// CHECK: encoding: [0x62,0x22,0xa5,0x02,0x0d,0xd1]
+ vpermilpd %xmm17, %xmm27, %xmm26 {%k2}
+
+// CHECK: vpermilpd %xmm17, %xmm27, %xmm26 {%k2} {z}
+// CHECK: encoding: [0x62,0x22,0xa5,0x82,0x0d,0xd1]
+ vpermilpd %xmm17, %xmm27, %xmm26 {%k2} {z}
+
+// CHECK: vpermilpd (%rcx), %xmm27, %xmm26
+// CHECK: encoding: [0x62,0x62,0xa5,0x00,0x0d,0x11]
+ vpermilpd (%rcx), %xmm27, %xmm26
+
+// CHECK: vpermilpd 291(%rax,%r14,8), %xmm27, %xmm26
+// CHECK: encoding: [0x62,0x22,0xa5,0x00,0x0d,0x94,0xf0,0x23,0x01,0x00,0x00]
+ vpermilpd 291(%rax,%r14,8), %xmm27, %xmm26
+
+// CHECK: vpermilpd (%rcx){1to2}, %xmm27, %xmm26
+// CHECK: encoding: [0x62,0x62,0xa5,0x10,0x0d,0x11]
+ vpermilpd (%rcx){1to2}, %xmm27, %xmm26
+
+// CHECK: vpermilpd 2032(%rdx), %xmm27, %xmm26
+// CHECK: encoding: [0x62,0x62,0xa5,0x00,0x0d,0x52,0x7f]
+ vpermilpd 2032(%rdx), %xmm27, %xmm26
+
+// CHECK: vpermilpd 2048(%rdx), %xmm27, %xmm26
+// CHECK: encoding: [0x62,0x62,0xa5,0x00,0x0d,0x92,0x00,0x08,0x00,0x00]
+ vpermilpd 2048(%rdx), %xmm27, %xmm26
+
+// CHECK: vpermilpd -2048(%rdx), %xmm27, %xmm26
+// CHECK: encoding: [0x62,0x62,0xa5,0x00,0x0d,0x52,0x80]
+ vpermilpd -2048(%rdx), %xmm27, %xmm26
+
+// CHECK: vpermilpd -2064(%rdx), %xmm27, %xmm26
+// CHECK: encoding: [0x62,0x62,0xa5,0x00,0x0d,0x92,0xf0,0xf7,0xff,0xff]
+ vpermilpd -2064(%rdx), %xmm27, %xmm26
+
+// CHECK: vpermilpd 1016(%rdx){1to2}, %xmm27, %xmm26
+// CHECK: encoding: [0x62,0x62,0xa5,0x10,0x0d,0x52,0x7f]
+ vpermilpd 1016(%rdx){1to2}, %xmm27, %xmm26
+
+// CHECK: vpermilpd 1024(%rdx){1to2}, %xmm27, %xmm26
+// CHECK: encoding: [0x62,0x62,0xa5,0x10,0x0d,0x92,0x00,0x04,0x00,0x00]
+ vpermilpd 1024(%rdx){1to2}, %xmm27, %xmm26
+
+// CHECK: vpermilpd -1024(%rdx){1to2}, %xmm27, %xmm26
+// CHECK: encoding: [0x62,0x62,0xa5,0x10,0x0d,0x52,0x80]
+ vpermilpd -1024(%rdx){1to2}, %xmm27, %xmm26
+
+// CHECK: vpermilpd -1032(%rdx){1to2}, %xmm27, %xmm26
+// CHECK: encoding: [0x62,0x62,0xa5,0x10,0x0d,0x92,0xf8,0xfb,0xff,0xff]
+ vpermilpd -1032(%rdx){1to2}, %xmm27, %xmm26
+
+// CHECK: vpermilpd %ymm24, %ymm26, %ymm26
+// CHECK: encoding: [0x62,0x02,0xad,0x20,0x0d,0xd0]
+ vpermilpd %ymm24, %ymm26, %ymm26
+
+// CHECK: vpermilpd %ymm24, %ymm26, %ymm26 {%k5}
+// CHECK: encoding: [0x62,0x02,0xad,0x25,0x0d,0xd0]
+ vpermilpd %ymm24, %ymm26, %ymm26 {%k5}
+
+// CHECK: vpermilpd %ymm24, %ymm26, %ymm26 {%k5} {z}
+// CHECK: encoding: [0x62,0x02,0xad,0xa5,0x0d,0xd0]
+ vpermilpd %ymm24, %ymm26, %ymm26 {%k5} {z}
+
+// CHECK: vpermilpd (%rcx), %ymm26, %ymm26
+// CHECK: encoding: [0x62,0x62,0xad,0x20,0x0d,0x11]
+ vpermilpd (%rcx), %ymm26, %ymm26
+
+// CHECK: vpermilpd 291(%rax,%r14,8), %ymm26, %ymm26
+// CHECK: encoding: [0x62,0x22,0xad,0x20,0x0d,0x94,0xf0,0x23,0x01,0x00,0x00]
+ vpermilpd 291(%rax,%r14,8), %ymm26, %ymm26
+
+// CHECK: vpermilpd (%rcx){1to4}, %ymm26, %ymm26
+// CHECK: encoding: [0x62,0x62,0xad,0x30,0x0d,0x11]
+ vpermilpd (%rcx){1to4}, %ymm26, %ymm26
+
+// CHECK: vpermilpd 4064(%rdx), %ymm26, %ymm26
+// CHECK: encoding: [0x62,0x62,0xad,0x20,0x0d,0x52,0x7f]
+ vpermilpd 4064(%rdx), %ymm26, %ymm26
+
+// CHECK: vpermilpd 4096(%rdx), %ymm26, %ymm26
+// CHECK: encoding: [0x62,0x62,0xad,0x20,0x0d,0x92,0x00,0x10,0x00,0x00]
+ vpermilpd 4096(%rdx), %ymm26, %ymm26
+
+// CHECK: vpermilpd -4096(%rdx), %ymm26, %ymm26
+// CHECK: encoding: [0x62,0x62,0xad,0x20,0x0d,0x52,0x80]
+ vpermilpd -4096(%rdx), %ymm26, %ymm26
+
+// CHECK: vpermilpd -4128(%rdx), %ymm26, %ymm26
+// CHECK: encoding: [0x62,0x62,0xad,0x20,0x0d,0x92,0xe0,0xef,0xff,0xff]
+ vpermilpd -4128(%rdx), %ymm26, %ymm26
+
+// CHECK: vpermilpd 1016(%rdx){1to4}, %ymm26, %ymm26
+// CHECK: encoding: [0x62,0x62,0xad,0x30,0x0d,0x52,0x7f]
+ vpermilpd 1016(%rdx){1to4}, %ymm26, %ymm26
+
+// CHECK: vpermilpd 1024(%rdx){1to4}, %ymm26, %ymm26
+// CHECK: encoding: [0x62,0x62,0xad,0x30,0x0d,0x92,0x00,0x04,0x00,0x00]
+ vpermilpd 1024(%rdx){1to4}, %ymm26, %ymm26
+
+// CHECK: vpermilpd -1024(%rdx){1to4}, %ymm26, %ymm26
+// CHECK: encoding: [0x62,0x62,0xad,0x30,0x0d,0x52,0x80]
+ vpermilpd -1024(%rdx){1to4}, %ymm26, %ymm26
+
+// CHECK: vpermilpd -1032(%rdx){1to4}, %ymm26, %ymm26
+// CHECK: encoding: [0x62,0x62,0xad,0x30,0x0d,0x92,0xf8,0xfb,0xff,0xff]
+ vpermilpd -1032(%rdx){1to4}, %ymm26, %ymm26
+
// CHECK: vcvtpd2dq %xmm20, %xmm25
// CHECK: encoding: [0x62,0x21,0xff,0x08,0xe6,0xcc]
vcvtpd2dq %xmm20, %xmm25