X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86InstrAVX512.td;h=463ab6232702374a785d0c4047bfedd2ee108a5e;hb=dfe88a08c7f595801e733d3e6f4504d2695d5309;hp=00b00aa75a117311930ea13d6610c6947b0d390d;hpb=33a95f24bbb55a53eb13a4a589c7d50f28b322c0;p=oota-llvm.git diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 00b00aa75a1..463ab623270 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -2,9 +2,12 @@ // EltVT). These are things like the register class for the writemask, etc. // The idea is to pass one of these as the template argument rather than the // individual arguments. -class X86VectorVTInfo { RegisterClass RC = rc; + ValueType EltVT = eltvt; + int NumElts = numelts; // Corresponding mask register class. RegisterClass KRC = !cast("VK" # NumElts); @@ -22,7 +25,13 @@ class X86VectorVTInfo(VTName); @@ -56,9 +65,11 @@ class X86VectorVTInfo("memopfsf32"), + !if (!eq (NumElts#EltTypeName, "1f64"), !cast("memopfsf64"), !if (!eq (TypeVariantName, "f"), !cast("memop" # VTName), !if (!eq (EltTypeName, "i64"), !cast("memop" # VTName), - !if (!eq (VTName, "v16i32"), !cast("memop" # VTName), ?))); + !if (!eq (VTName, "v16i32"), !cast("memop" # VTName), ?))))); // The corresponding float type, e.g. v16f32 for v16i32 // Note: For EltSize < 32, FloatVT is illegal and TableGen @@ -73,6 +84,11 @@ class X86VectorVTInfo("CD8VT" # NumElts), ?); + SubRegIndex SubRegIdx = !if (!eq (Size, 128), sub_xmm, !if (!eq (Size, 256), sub_ymm, ?)); @@ -98,11 +114,20 @@ def v32i8x_info : X86VectorVTInfo<32, i8, VR256X, "b">; def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">; def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">; def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">; +def v8f32x_info : X86VectorVTInfo<8, f32, VR256X, "ps">; +def v4f64x_info : X86VectorVTInfo<4, f64, VR256X, "pd">; def v16i8x_info : X86VectorVTInfo<16, i8, VR128X, "b">; def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">; def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">; def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">; +def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">; +def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">; + +// We map scalar types to the smallest (128-bit) vector type +// with the appropriate element type. This allows to use the same masking logic. +def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">; +def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">; class AVX512VLVectorVTInfo { @@ -119,6 +144,10 @@ def avx512vl_i32_info : AVX512VLVectorVTInfo; def avx512vl_i64_info : AVX512VLVectorVTInfo; +def avx512vl_f32_info : AVX512VLVectorVTInfo; +def avx512vl_f64_info : AVX512VLVectorVTInfo; // This multiclass generates the masking variants from the non-masking // variant. It only provides the assembly pieces for the masking variants. @@ -132,20 +161,21 @@ multiclass AVX512_maskable_custom O, Format F, list Pattern, list MaskingPattern, list ZeroMaskingPattern, + string Round = "", string MaskingConstraint = "", InstrItinClass itin = NoItinerary, bit IsCommutable = 0> { let isCommutable = IsCommutable in def NAME: AVX512; // Prefer over VMOV*rrk Pat<> let AddedComplexity = 20 in def NAME#k: AVX512, EVEX_K { // In case of the 3src subclass this is overridden with a let. @@ -153,8 +183,8 @@ multiclass AVX512_maskable_custom O, Format F, } let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<> def NAME#kz: AVX512, EVEX_KZ; @@ -168,6 +198,7 @@ multiclass AVX512_maskable_common O, Format F, X86VectorVTInfo _, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, dag RHS, dag MaskingRHS, + SDNode Select = vselect, string Round = "", string MaskingConstraint = "", InstrItinClass itin = NoItinerary, bit IsCommutable = 0> : @@ -176,23 +207,39 @@ multiclass AVX512_maskable_common O, Format F, X86VectorVTInfo _, [(set _.RC:$dst, RHS)], [(set _.RC:$dst, MaskingRHS)], [(set _.RC:$dst, - (vselect _.KRCWM:$mask, RHS, _.ImmAllZerosV))], - MaskingConstraint, NoItinerary, IsCommutable>; + (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))], + Round, MaskingConstraint, NoItinerary, IsCommutable>; // This multiclass generates the unconditional/non-masking, the masking and -// the zero-masking variant of the instruction. In the masking case, the +// the zero-masking variant of the vector instruction. In the masking case, the // perserved vector elements come from a new dummy input operand tied to $dst. multiclass AVX512_maskable O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, InstrItinClass itin = NoItinerary, + dag RHS, string Round = "", + InstrItinClass itin = NoItinerary, bit IsCommutable = 0> : AVX512_maskable_common; + (vselect _.KRCWM:$mask, RHS, _.RC:$src0), vselect, + Round, "$src0 = $dst", itin, IsCommutable>; + +// This multiclass generates the unconditional/non-masking, the masking and +// the zero-masking variant of the scalar instruction. +multiclass AVX512_maskable_scalar O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, string Round = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> : + AVX512_maskable_common; // Similar to AVX512_maskable but in this case one of the source operands // ($src1) is already tied to $dst so we just use that for the preserved @@ -218,7 +265,7 @@ multiclass AVX512_maskable_in_asm O, Format F, X86VectorVTInfo _, AVX512_maskable_custom; // Bitcasts between 512-bit vector types. Return the original type since @@ -341,15 +388,15 @@ def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>; // AVX-512 - VECTOR INSERT // -multiclass vinsert_for_size { +multiclass vinsert_for_size_no_alt { let hasSideEffects = 0, ExeDomain = To.ExeDomain in { def rr : AVX512AIi8, EVEX_4V, EVEX_V512, EVEX_CD8; + []>, + EVEX_4V, EVEX_V512, EVEX_CD8; } - - // Codegen pattern with the alternative types, e.g. v2i64 -> v8i64 for - // vinserti32x4 - def : Pat<(vinsert_insert:$ins - (AltTo.VT VR512:$src1), (AltFrom.VT From.RC:$src2), (iPTR imm)), - (AltTo.VT (!cast(NAME # From.EltSize # "x4rr") - VR512:$src1, From.RC:$src2, - (INSERT_get_vinsert_imm VR512:$ins)))>; } -multiclass vinsert_for_type { - defm NAME # "32x4" : vinsert_for_size : + vinsert_for_size_no_alt { + // Codegen pattern with the alternative types, e.g. v2i64 -> v8i64 for + // vinserti32x4. Only add this if 64x2 and friends are not supported + // natively via AVX512DQ. + let Predicates = [NoDQI] in + def : Pat<(vinsert_insert:$ins + (AltTo.VT VR512:$src1), (AltFrom.VT From.RC:$src2), (iPTR imm)), + (AltTo.VT (!cast(NAME # From.EltSize # "x4rr") + VR512:$src1, From.RC:$src2, + (INSERT_get_vinsert_imm VR512:$ins)))>; +} + +multiclass vinsert_for_type { + defm NAME # "32x4" : vinsert_for_size, X86VectorVTInfo<16, EltVT32, VR512>, X86VectorVTInfo< 2, EltVT64, VR128X>, X86VectorVTInfo< 8, EltVT64, VR512>, vinsert128_insert, INSERT_get_vinsert128_imm>; - defm NAME # "64x4" : vinsert_for_size, + X86VectorVTInfo< 8, EltVT64, VR512>, + vinsert128_insert, + INSERT_get_vinsert128_imm>, VEX_W; + defm NAME # "64x4" : vinsert_for_size, X86VectorVTInfo< 8, EltVT64, VR512>, X86VectorVTInfo< 8, EltVT32, VR256>, X86VectorVTInfo<16, EltVT32, VR512>, vinsert256_insert, INSERT_get_vinsert256_imm>, VEX_W; + let Predicates = [HasDQI] in + defm NAME # "32x8" : vinsert_for_size_no_alt, + X86VectorVTInfo<16, EltVT32, VR512>, + vinsert256_insert, + INSERT_get_vinsert256_imm>; } defm VINSERTF : vinsert_for_type; @@ -539,105 +610,175 @@ def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs), //===---------------------------------------------------------------------===// // AVX-512 BROADCAST //--- -multiclass avx512_fp_broadcast opc, string OpcodeStr, - RegisterClass DestRC, - RegisterClass SrcRC, X86MemOperand x86memop> { - def rr : AVX5128I, EVEX; - def rm : AVX5128I, EVEX; +multiclass avx512_fp_broadcast opc, SDNode OpNode, RegisterClass SrcRC, + ValueType svt, X86VectorVTInfo _> { + defm r : AVX512_maskable, + T8PD, EVEX; + + let mayLoad = 1 in { + defm m : AVX512_maskable, + T8PD, EVEX; + } } + +multiclass avx512_fp_broadcast_vl opc, SDNode OpNode, + AVX512VLVectorVTInfo _> { + defm Z : avx512_fp_broadcast, + EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : avx512_fp_broadcast, + EVEX_V256; + } +} + let ExeDomain = SSEPackedSingle in { - defm VBROADCASTSSZ : avx512_fp_broadcast<0x18, "vbroadcastss", VR512, - VR128X, f32mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; + defm VBROADCASTSS : avx512_fp_broadcast_vl<0x18, X86VBroadcast, + avx512vl_f32_info>, EVEX_CD8<32, CD8VT1>; + let Predicates = [HasVLX] in { + defm VBROADCASTSSZ128 : avx512_fp_broadcast<0x18, X86VBroadcast, VR128X, + v4f32, v4f32x_info>, EVEX_V128, + EVEX_CD8<32, CD8VT1>; + } } let ExeDomain = SSEPackedDouble in { - defm VBROADCASTSDZ : avx512_fp_broadcast<0x19, "vbroadcastsd", VR512, - VR128X, f64mem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + defm VBROADCASTSD : avx512_fp_broadcast_vl<0x19, X86VBroadcast, + avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VT1>; +} + +// avx512_broadcast_pat introduces patterns for broadcast with a scalar argument. +// Later, we can canonize broadcast instructions before ISel phase and +// eliminate additional patterns on ISel. +// SrcRC_v and SrcRC_s are RegisterClasses for vector and scalar +// representations of source +multiclass avx512_broadcast_pat { + def : Pat<(_.VT (OpNode (_.EltVT SrcRC_s:$src))), + (!cast(InstName##"r") + (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>; + + let AddedComplexity = 30 in { + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (_.EltVT SrcRC_s:$src)), _.RC:$src0)), + (!cast(InstName##"rk") _.RC:$src0, _.KRCWM:$mask, + (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>; + + def : Pat<(_.VT(vselect _.KRCWM:$mask, + (OpNode (_.EltVT SrcRC_s:$src)), _.ImmAllZerosV)), + (!cast(InstName##"rkz") _.KRCWM:$mask, + (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>; + } +} + +defm : avx512_broadcast_pat<"VBROADCASTSSZ", X86VBroadcast, v16f32_info, + VR128X, FR32X>; +defm : avx512_broadcast_pat<"VBROADCASTSDZ", X86VBroadcast, v8f64_info, + VR128X, FR64X>; + +let Predicates = [HasVLX] in { + defm : avx512_broadcast_pat<"VBROADCASTSSZ256", X86VBroadcast, + v8f32x_info, VR128X, FR32X>; + defm : avx512_broadcast_pat<"VBROADCASTSSZ128", X86VBroadcast, + v4f32x_info, VR128X, FR32X>; + defm : avx512_broadcast_pat<"VBROADCASTSDZ256", X86VBroadcast, + v4f64x_info, VR128X, FR64X>; } def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSSZrm addr:$src)>; + (VBROADCASTSSZm addr:$src)>; def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))), - (VBROADCASTSDZrm addr:$src)>; + (VBROADCASTSDZm addr:$src)>; def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src), - (VBROADCASTSSZrm addr:$src)>; + (VBROADCASTSSZm addr:$src)>; def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src), - (VBROADCASTSDZrm addr:$src)>; - -multiclass avx512_int_broadcast_reg opc, string OpcodeStr, - RegisterClass SrcRC, RegisterClass KRC> { - def Zrr : AVX5128I, EVEX, EVEX_V512; - def Zkrr : AVX5128I, EVEX, EVEX_V512, EVEX_KZ; -} - -defm VPBROADCASTDr : avx512_int_broadcast_reg<0x7C, "vpbroadcastd", GR32, VK16WM>; -defm VPBROADCASTQr : avx512_int_broadcast_reg<0x7C, "vpbroadcastq", GR64, VK8WM>, - VEX_W; - + (VBROADCASTSDZm addr:$src)>; + +multiclass avx512_int_broadcast_reg opc, X86VectorVTInfo _, + RegisterClass SrcRC> { + defm r : AVX512_maskable_in_asm, T8PD, EVEX; +} + +multiclass avx512_int_broadcast_reg_vl opc, AVX512VLVectorVTInfo _, + RegisterClass SrcRC, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_int_broadcast_reg, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_int_broadcast_reg, EVEX_V256; + defm Z128 : avx512_int_broadcast_reg, EVEX_V128; + } +} + +defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info, GR32, + HasBWI>; +defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info, GR32, + HasBWI>; +defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, GR32, + HasAVX512>; +defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, GR64, + HasAVX512>, VEX_W; + def : Pat <(v16i32 (X86vzext VK16WM:$mask)), - (VPBROADCASTDrZkrr VK16WM:$mask, (i32 (MOV32ri 0x1)))>; + (VPBROADCASTDrZrkz VK16WM:$mask, (i32 (MOV32ri 0x1)))>; def : Pat <(v8i64 (X86vzext VK8WM:$mask)), - (VPBROADCASTQrZkrr VK8WM:$mask, (i64 (MOV64ri 0x1)))>; + (VPBROADCASTQrZrkz VK8WM:$mask, (i64 (MOV64ri 0x1)))>; def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))), - (VPBROADCASTDrZrr GR32:$src)>; + (VPBROADCASTDrZr GR32:$src)>; def : Pat<(v16i32 (X86VBroadcastm VK16WM:$mask, (i32 GR32:$src))), - (VPBROADCASTDrZkrr VK16WM:$mask, GR32:$src)>; + (VPBROADCASTDrZrkz VK16WM:$mask, GR32:$src)>; def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))), - (VPBROADCASTQrZrr GR64:$src)>; + (VPBROADCASTQrZr GR64:$src)>; def : Pat<(v8i64 (X86VBroadcastm VK8WM:$mask, (i64 GR64:$src))), - (VPBROADCASTQrZkrr VK8WM:$mask, GR64:$src)>; + (VPBROADCASTQrZrkz VK8WM:$mask, GR64:$src)>; def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_i32_512 (i32 GR32:$src))), - (VPBROADCASTDrZrr GR32:$src)>; + (VPBROADCASTDrZr GR32:$src)>; def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_i64_512 (i64 GR64:$src))), - (VPBROADCASTQrZrr GR64:$src)>; + (VPBROADCASTQrZr GR64:$src)>; def : Pat<(v16i32 (int_x86_avx512_mask_pbroadcast_d_gpr_512 (i32 GR32:$src), (v16i32 immAllZerosV), (i16 GR16:$mask))), - (VPBROADCASTDrZkrr (COPY_TO_REGCLASS GR16:$mask, VK16WM), GR32:$src)>; + (VPBROADCASTDrZrkz (COPY_TO_REGCLASS GR16:$mask, VK16WM), GR32:$src)>; def : Pat<(v8i64 (int_x86_avx512_mask_pbroadcast_q_gpr_512 (i64 GR64:$src), (bc_v8i64 (v16i32 immAllZerosV)), (i8 GR8:$mask))), - (VPBROADCASTQrZkrr (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>; + (VPBROADCASTQrZrkz (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>; multiclass avx512_int_broadcast_rm opc, string OpcodeStr, X86MemOperand x86memop, PatFrag ld_frag, RegisterClass DstRC, ValueType OpVT, ValueType SrcVT, RegisterClass KRC> { def rr : AVX5128I, EVEX; def krr : AVX5128I, EVEX, EVEX_KZ; let mayLoad = 1 in { def rm : AVX5128I, EVEX; def krm : AVX5128I, EVEX, EVEX_KZ; } } @@ -654,12 +795,12 @@ multiclass avx512_int_subvec_broadcast_rm opc, string OpcodeStr, RegisterClass KRC> { let mayLoad = 1 in { def rm : AVX5128I, EVEX; def krm : AVX5128I, EVEX, EVEX_KZ; } } @@ -676,27 +817,32 @@ def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))), def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))), (VPBROADCASTQZrr VR128X:$src)>; -def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))), - (VBROADCASTSSZrr VR128X:$src)>; -def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))), - (VBROADCASTSDZrr VR128X:$src)>; +def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))), + (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; +def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))), + (VBROADCASTSDZr (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>; + +def : Pat<(v16i32 (X86VBroadcast (v16i32 VR512:$src))), + (VPBROADCASTDZrr (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>; +def : Pat<(v8i64 (X86VBroadcast (v8i64 VR512:$src))), + (VPBROADCASTQZrr (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>; def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))), - (VBROADCASTSSZrr VR128X:$src)>; + (VBROADCASTSSZr VR128X:$src)>; def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))), - (VBROADCASTSDZrr VR128X:$src)>; - + (VBROADCASTSDZr VR128X:$src)>; + // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. def : Pat<(v16f32 (X86VBroadcast FR32X:$src)), - (VBROADCASTSSZrr (COPY_TO_REGCLASS FR32X:$src, VR128X))>; + (VBROADCASTSSZr (COPY_TO_REGCLASS FR32X:$src, VR128X))>; def : Pat<(v8f64 (X86VBroadcast FR64X:$src)), - (VBROADCASTSDZrr (COPY_TO_REGCLASS FR64X:$src, VR128X))>; + (VBROADCASTSDZr (COPY_TO_REGCLASS FR64X:$src, VR128X))>; let Predicates = [HasAVX512] in { def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))), - (EXTRACT_SUBREG + (EXTRACT_SUBREG (v16i32 (VPBROADCASTDZkrm (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), addr:$src)), sub_ymm)>; } @@ -705,64 +851,107 @@ def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))), //--- multiclass avx512_mask_broadcast opc, string OpcodeStr, - RegisterClass DstRC, RegisterClass KRC, - ValueType OpVT, ValueType SrcVT> { -def rr : AVX512XS8I, EVEX; + RegisterClass KRC> { +let Predicates = [HasCDI] in +def Zrr : AVX512XS8I, EVEX, EVEX_V512; + +let Predicates = [HasCDI, HasVLX] in { +def Z128rr : AVX512XS8I, EVEX, EVEX_V128; +def Z256rr : AVX512XS8I, EVEX, EVEX_V256; +} } let Predicates = [HasCDI] in { -defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512, - VK16, v16i32, v16i1>, EVEX_V512; -defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512, - VK8, v8i64, v8i1>, EVEX_V512, VEX_W; +defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", + VK16>; +defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", + VK8>, VEX_W; } //===----------------------------------------------------------------------===// // AVX-512 - VPERM // // -- immediate form -- -multiclass avx512_perm_imm opc, string OpcodeStr, RegisterClass RC, - SDNode OpNode, PatFrag mem_frag, - X86MemOperand x86memop, ValueType OpVT> { - def ri : AVX512AIi8 opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + def ri : AVX512AIi8, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, + (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>, EVEX; - def mi : AVX512AIi8, EVEX; + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, + (_.VT (OpNode (_.MemOpFrag addr:$src1), + (i8 imm:$src2))))]>, + EVEX, EVEX_CD8<_.EltSize, CD8VF>; +} +} + +multiclass avx512_permil OpcImm, bits<8> OpcVar, X86VectorVTInfo _, + X86VectorVTInfo Ctrl> : + avx512_perm_imm { + let ExeDomain = _.ExeDomain in { + def rr : AVX5128I, + EVEX_4V; + def rm : AVX5128I, + EVEX_4V; + } } -defm VPERMQZ : avx512_perm_imm<0x00, "vpermq", VR512, X86VPermi, memopv8i64, - i512mem, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -let ExeDomain = SSEPackedDouble in -defm VPERMPDZ : avx512_perm_imm<0x01, "vpermpd", VR512, X86VPermi, memopv8f64, - f512mem, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMQZ : avx512_perm_imm<0x00, "vpermq", X86VPermi, v8i64_info>, + EVEX_V512, VEX_W; +defm VPERMPDZ : avx512_perm_imm<0x01, "vpermpd", X86VPermi, v8f64_info>, + EVEX_V512, VEX_W; + +defm VPERMILPSZ : avx512_permil<0x04, 0x0C, v16f32_info, v16i32_info>, + EVEX_V512; +defm VPERMILPDZ : avx512_permil<0x05, 0x0D, v8f64_info, v8i64_info>, + EVEX_V512, VEX_W; + +def : Pat<(v16i32 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), + (VPERMILPSZri VR512:$src1, imm:$imm)>; +def : Pat<(v8i64 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), + (VPERMILPDZri VR512:$src1, imm:$imm)>; // -- VPERM - register form -- -multiclass avx512_perm opc, string OpcodeStr, RegisterClass RC, +multiclass avx512_perm opc, string OpcodeStr, RegisterClass RC, PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT> { def rr : AVX5128I, EVEX_4V; def rm : AVX5128I, EVEX_4V; @@ -770,13 +959,13 @@ multiclass avx512_perm opc, string OpcodeStr, RegisterClass RC, defm VPERMDZ : avx512_perm<0x36, "vpermd", VR512, memopv16i32, i512mem, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMQZ : avx512_perm<0x36, "vpermq", VR512, memopv8i64, i512mem, +defm VPERMQZ : avx512_perm<0x36, "vpermq", VR512, memopv8i64, i512mem, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; let ExeDomain = SSEPackedSingle in defm VPERMPSZ : avx512_perm<0x16, "vpermps", VR512, memopv16f32, f512mem, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; let ExeDomain = SSEPackedDouble in -defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem, +defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; // -- VPERM2I - 3 source operands form -- @@ -787,7 +976,7 @@ let Constraints = "$src1 = $dst" in { def rr : AVX5128I, EVEX_4V; @@ -795,7 +984,7 @@ let Constraints = "$src1 = $dst" in { def rrk : AVX5128I, EVEX_4V; @@ -827,7 +1016,7 @@ let Constraints = "$src1 = $dst" in { def rmk : AVX5128I opc, string OpcodeStr, - RegisterClass KRC, RegisterClass RC, - X86MemOperand x86memop, PatFrag mem_frag, - SDNode OpNode, ValueType vt> { - def rr : AVX5128I opc, string OpcodeStr, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + def rr : AVX5128I, EVEX_4V, EVEX_K; - let mayLoad = 1 in - def rm : AVX5128I, EVEX_4V; + def rrk : AVX5128I, EVEX_4V, EVEX_K; + def rrkz : AVX5128I, EVEX_4V, EVEX_K; + "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"), + []>, EVEX_4V, EVEX_KZ; + let mayLoad = 1 in { + def rm : AVX5128I, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + def rmk : AVX5128I, + EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>; + def rmkz : AVX5128I, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>; + } + } } +multiclass avx512_blendmask_rmb opc, string OpcodeStr, X86VectorVTInfo _> { + + def rmbk : AVX5128I, + EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + + def rmb : AVX5128I, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + +} + +multiclass blendmask_dq opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : avx512_blendmask , + avx512_blendmask_rmb , EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : avx512_blendmask, + avx512_blendmask_rmb , EVEX_V256; + defm Z128 : avx512_blendmask, + avx512_blendmask_rmb , EVEX_V128; + } +} + +multiclass blendmask_bw opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + let Predicates = [HasBWI] in + defm Z : avx512_blendmask , EVEX_V512; + + let Predicates = [HasBWI, HasVLX] in { + defm Z256 : avx512_blendmask , EVEX_V256; + defm Z128 : avx512_blendmask , EVEX_V128; + } +} + + +defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", avx512vl_f32_info>; +defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", avx512vl_f64_info>, VEX_W; +defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", avx512vl_i32_info>; +defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", avx512vl_i64_info>, VEX_W; +defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>; +defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W; -let ExeDomain = SSEPackedSingle in -defm VBLENDMPSZ : avx512_blendmask<0x65, "vblendmps", - VK16WM, VR512, f512mem, - memopv16f32, vselect, v16f32>, - EVEX_CD8<32, CD8VF>, EVEX_V512; -let ExeDomain = SSEPackedDouble in -defm VBLENDMPDZ : avx512_blendmask<0x65, "vblendmpd", - VK8WM, VR512, f512mem, - memopv8f64, vselect, v8f64>, - VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512; - -def : Pat<(v16f32 (int_x86_avx512_mask_blend_ps_512 (v16f32 VR512:$src1), - (v16f32 VR512:$src2), (i16 GR16:$mask))), - (VBLENDMPSZrr (COPY_TO_REGCLASS GR16:$mask, VK16WM), - VR512:$src1, VR512:$src2)>; - -def : Pat<(v8f64 (int_x86_avx512_mask_blend_pd_512 (v8f64 VR512:$src1), - (v8f64 VR512:$src2), (i8 GR8:$mask))), - (VBLENDMPDZrr (COPY_TO_REGCLASS GR8:$mask, VK8WM), - VR512:$src1, VR512:$src2)>; - -defm VPBLENDMDZ : avx512_blendmask<0x64, "vpblendmd", - VK16WM, VR512, f512mem, - memopv16i32, vselect, v16i32>, - EVEX_CD8<32, CD8VF>, EVEX_V512; - -defm VPBLENDMQZ : avx512_blendmask<0x64, "vpblendmq", - VK8WM, VR512, f512mem, - memopv8i64, vselect, v8i64>, - VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512; - -def : Pat<(v16i32 (int_x86_avx512_mask_blend_d_512 (v16i32 VR512:$src1), - (v16i32 VR512:$src2), (i16 GR16:$mask))), - (VPBLENDMDZrr (COPY_TO_REGCLASS GR16:$mask, VK16), - VR512:$src1, VR512:$src2)>; - -def : Pat<(v8i64 (int_x86_avx512_mask_blend_q_512 (v8i64 VR512:$src1), - (v8i64 VR512:$src2), (i8 GR8:$mask))), - (VPBLENDMQZrr (COPY_TO_REGCLASS GR8:$mask, VK8), - VR512:$src1, VR512:$src2)>; let Predicates = [HasAVX512] in { def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1), (v8f32 VR256X:$src2))), - (EXTRACT_SUBREG - (v16f32 (VBLENDMPSZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (EXTRACT_SUBREG + (v16f32 (VBLENDMPSZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), - (EXTRACT_SUBREG - (v16i32 (VPBLENDMDZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (EXTRACT_SUBREG + (v16i32 (VPBLENDMDZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; } @@ -1304,17 +1526,17 @@ multiclass avx512_cmp_packed; def rrib: AVX512PIi8<0xC2, MRMSrcReg, (outs KRC:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc), !strconcat("vcmp${cc}", suffix, - " \t{{sae}, $src2, $src1, $dst|$dst, $src1, $src2, {sae}}"), + "\t{{sae}, $src2, $src1, $dst|$dst, $src1, $src2, {sae}}"), [], d>, EVEX_B; def rmi : AVX512PIi8<0xC2, MRMSrcMem, (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, AVXCC:$cc), !strconcat("vcmp${cc}", suffix, - " \t{$src2, $src1, $dst|$dst, $src1, $src2, $cc}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [(set KRC:$dst, (X86cmpm (vt RC:$src1), (memop addr:$src2), imm:$cc))], d>; @@ -1323,11 +1545,11 @@ multiclass avx512_cmp_packed; + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; def rmi_alt : AVX512PIi8<0xC2, MRMSrcMem, (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), !strconcat("vcmp", suffix, - " \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; } } @@ -1355,25 +1577,25 @@ def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), imm:$cc), VK8)>; def : Pat<(i16 (int_x86_avx512_mask_cmp_ps_512 (v16f32 VR512:$src1), - (v16f32 VR512:$src2), imm:$cc, (i16 -1), + (v16f32 VR512:$src2), i32immZExt5:$cc, (i16 -1), FROUND_NO_EXC)), (COPY_TO_REGCLASS (VCMPPSZrrib VR512:$src1, VR512:$src2, (I8Imm imm:$cc)), GR16)>; - + def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1), - (v8f64 VR512:$src2), imm:$cc, (i8 -1), + (v8f64 VR512:$src2), i32immZExt5:$cc, (i8 -1), FROUND_NO_EXC)), (COPY_TO_REGCLASS (VCMPPDZrrib VR512:$src1, VR512:$src2, (I8Imm imm:$cc)), GR8)>; def : Pat<(i16 (int_x86_avx512_mask_cmp_ps_512 (v16f32 VR512:$src1), - (v16f32 VR512:$src2), imm:$cc, (i16 -1), + (v16f32 VR512:$src2), i32immZExt5:$cc, (i16 -1), FROUND_CURRENT)), (COPY_TO_REGCLASS (VCMPPSZrri VR512:$src1, VR512:$src2, (I8Imm imm:$cc)), GR16)>; def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1), - (v8f64 VR512:$src2), imm:$cc, (i8 -1), + (v8f64 VR512:$src2), i32immZExt5:$cc, (i8 -1), FROUND_CURRENT)), (COPY_TO_REGCLASS (VCMPPDZrri VR512:$src1, VR512:$src2, (I8Imm imm:$cc)), GR8)>; @@ -1388,14 +1610,14 @@ multiclass avx512_mask_mov opc_kk, bits<8> opc_km, bits<8> opc_mk, ValueType vvt, ValueType ivt, X86MemOperand x86memop> { let hasSideEffects = 0 in { def kk : I; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; let mayLoad = 1 in def km : I; let mayStore = 1 in def mk : I; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; } } @@ -1404,9 +1626,9 @@ multiclass avx512_mask_mov_gpr opc_kr, bits<8> opc_rk, RegisterClass KRC, RegisterClass GRC> { let hasSideEffects = 0 in { def kr : I; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; def rk : I; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; } } @@ -1556,7 +1778,7 @@ multiclass avx512_mask_unop opc, string OpcodeStr, Predicate prd> { let Predicates = [prd] in def rr : I; } @@ -1610,7 +1832,7 @@ multiclass avx512_mask_binop opc, string OpcodeStr, let Predicates = [prd] in def rr : I; } @@ -1686,7 +1908,7 @@ multiclass avx512_mask_unpck opc, string OpcodeStr, let Predicates = [HasAVX512] in def rr : I; + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; } multiclass avx512_mask_unpck_bw opc, string OpcodeStr> { @@ -1715,7 +1937,7 @@ multiclass avx512_mask_testop opc, string OpcodeStr, RegisterClass KRC, SDNode OpNode> { let Predicates = [HasAVX512], Defs = [EFLAGS] in def rr : I; } @@ -1736,7 +1958,7 @@ multiclass avx512_mask_shiftop opc, string OpcodeStr, RegisterClass KRC, let Predicates = [HasAVX512] in def ri : Ii8; } @@ -1891,7 +2113,7 @@ multiclass avx512_load_vl opc, string OpcodeStr, string ld_pat, multiclass avx512_store opc, string OpcodeStr, PatFrag st_frag, ValueType OpVT, RegisterClass KRC, RegisterClass RC, X86MemOperand memop, Domain d> { - let isAsmParserOnly = 1, hasSideEffects = 0 in { + let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def rr_alt : AVX512PI, EVEX; @@ -1987,6 +2209,46 @@ def: Pat<(int_x86_avx512_mask_storeu_pd_512 addr:$ptr, (v8f64 VR512:$src), (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src)>; +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)), + (VMOVUPSZmrk addr:$ptr, + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; + +def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; + +def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src)), + (VMOVUPSZmrk addr:$ptr, VK16WM:$mask, VR512:$src)>; + +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src)), + (VMOVUPDZmrk addr:$ptr, VK8WM:$mask, VR512:$src)>; + +def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, undef)), + (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, + (bc_v16f32 (v16i32 immAllZerosV)))), + (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src0))), + (VMOVUPSZrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, + (bc_v8f64 (v16i32 immAllZerosV)))), + (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src0))), + (VMOVUPDZrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src0))), + (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmk + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src0, sub_ymm), + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; + defm VMOVDQA32 : avx512_load_vl<0x6F, "vmovdqa32", "alignedload", "i", "32", "16", "8", "4", SSEPackedInt, HasAVX512>, avx512_store_vl<0x7F, "vmovdqa32", "alignedstore", @@ -2061,6 +2323,46 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; } +def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 immAllZerosV))), + (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, undef)), + (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src0))), + (VMOVDQU32Zrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, + (bc_v8i64 (v16i32 immAllZerosV)))), + (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src0))), + (VMOVDQU64Zrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>; + +def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src)), + (VMOVDQU32Zmrk addr:$ptr, VK16WM:$mask, VR512:$src)>; + +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src)), + (VMOVDQU64Zmrk addr:$ptr, VK8WM:$mask, VR512:$src)>; + +// SKX replacement +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), + (VMOVDQU32Z256mrk addr:$ptr, VK8WM:$mask, VR256:$src)>; + +// KNL replacement +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), + (VMOVDQU32Zmrk addr:$ptr, + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), + (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; + +def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; + + // Move Int Doubleword to Packed Double Int // def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src), @@ -2167,12 +2469,12 @@ def VMOVQI2PQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), // AVX-512 MOVSS, MOVSD //===----------------------------------------------------------------------===// -multiclass avx512_move_scalar { let hasSideEffects = 0 in { - def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), - !strconcat(asm, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128X:$dst, (vt (OpNode VR128X:$src1, (scalar_to_vector RC:$src2))))], IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG; @@ -2180,19 +2482,19 @@ multiclass avx512_move_scalar , EVEX_4V, VEX_LIG, EVEX_K; def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(asm, " \t{$src, $dst|$dst, $src}"), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>, EVEX, VEX_LIG; let mayStore = 1 in { def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), - !strconcat(asm, " \t{$src, $dst|$dst, $src}"), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, EVEX, VEX_LIG; def mrk: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, VK1WM:$mask, RC:$src), - !strconcat(asm, " \t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), + !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), [], IIC_SSE_MOV_S_MR>, EVEX, VEX_LIG, EVEX_K; } // mayStore @@ -2249,7 +2551,7 @@ let Predicates = [HasAVX512] in { // Move low f32 and clear high bits. def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSSZrr (v4f32 (V_SET0)), + (VMOVSSZrr (v4f32 (V_SET0)), (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>; def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))), (SUBREG_TO_REG (i32 0), @@ -2378,7 +2680,7 @@ let AddedComplexity = 15 in def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src), "vmovq\t{$src, $dst|$dst, $src}", - [(set VR128X:$dst, (v2i64 (X86vzmovl + [(set VR128X:$dst, (v2i64 (X86vzmovl (v2i64 VR128X:$src))))], IIC_SSE_MOVQ_RR>, EVEX, VEX_W; @@ -2400,7 +2702,7 @@ let Predicates = [HasAVX512] in { (VMOV64toPQIZrr GR64:$src)>; def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), (VMOVDI2PDIZrr GR32:$src)>; - + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), (VMOVDI2PDIZrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), @@ -2514,7 +2816,7 @@ multiclass avx512_binop_rm opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, _.RC:$src2)), - itins.rr, IsCommutable>, + "", itins.rr, IsCommutable>, AVX512BIBase, EVEX_4V; let mayLoad = 1 in @@ -2523,7 +2825,7 @@ multiclass avx512_binop_rm opc, string OpcodeStr, SDNode OpNode, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))), - itins.rm>, + "", itins.rm>, AVX512BIBase, EVEX_4V; } @@ -2539,7 +2841,7 @@ multiclass avx512_binop_rmb opc, string OpcodeStr, SDNode OpNode, (_.VT (OpNode _.RC:$src1, (X86VBroadcast (_.ScalarLdFrag addr:$src2)))), - itins.rm>, + "", itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B; } @@ -2641,48 +2943,48 @@ multiclass avx512_binop_rm2 opc, string OpcodeStr, ValueType DstVT, { def rr : AVX512BI, EVEX_4V; def rrk : AVX512BI, EVEX_4V, EVEX_K; def rrkz : AVX512BI, EVEX_4V, EVEX_KZ; } let mayLoad = 1 in { def rm : AVX512BI, EVEX_4V; def rmk : AVX512BI, EVEX_4V, EVEX_K; def rmkz : AVX512BI, EVEX_4V, EVEX_KZ; def rmb : AVX512BI, EVEX_4V, EVEX_B; def rmbk : AVX512BI, EVEX_4V, EVEX_B, EVEX_K; def rmbkz : AVX512BI, EVEX_4V, EVEX_B, EVEX_KZ; @@ -2697,6 +2999,8 @@ defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmull", mul, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmull", mul, SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmull", mul, + SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; defm VPMULDQZ : avx512_binop_rm2<0x28, "vpmuldq", v8i64, v16i32, VK8WM, VR512, memopv8i64, i512mem, loadi64, i64mem, "{1to8}", @@ -2808,12 +3112,12 @@ multiclass avx512_unpack_int opc, string OpcodeStr, SDNode OpNode, X86MemOperand x86memop> { def rr : AVX512BI, EVEX_4V; def rm : AVX512BI, EVEX_4V; @@ -2835,19 +3139,19 @@ defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64, // multiclass avx512_pshuf_imm opc, string OpcodeStr, RegisterClass RC, - SDNode OpNode, PatFrag mem_frag, + SDNode OpNode, PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT> { def ri : AVX512Ii8, EVEX; def mi : AVX512Ii8, EVEX; @@ -2856,20 +3160,6 @@ multiclass avx512_pshuf_imm opc, string OpcodeStr, RegisterClass RC, defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32, i512mem, v16i32>, PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -let ExeDomain = SSEPackedSingle in -defm VPERMILPSZ : avx512_pshuf_imm<0x04, "vpermilps", VR512, X86VPermilpi, - memopv16f32, i512mem, v16f32>, TAPD, EVEX_V512, - EVEX_CD8<32, CD8VF>; -let ExeDomain = SSEPackedDouble in -defm VPERMILPDZ : avx512_pshuf_imm<0x05, "vpermilpd", VR512, X86VPermilpi, - memopv8f64, i512mem, v8f64>, TAPD, EVEX_V512, - VEX_W, EVEX_CD8<32, CD8VF>; - -def : Pat<(v16i32 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), - (VPERMILPSZri VR512:$src1, imm:$imm)>; -def : Pat<(v8i64 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), - (VPERMILPDZri VR512:$src1, imm:$imm)>; - //===----------------------------------------------------------------------===// // AVX-512 Logical Instructions //===----------------------------------------------------------------------===// @@ -2909,118 +3199,58 @@ defm VDIV : avx512_binop_s<0x5E, "div", fdiv, SSE_ALU_ITINS_S>; } multiclass avx512_fp_packed opc, string OpcodeStr, SDNode OpNode, - RegisterClass KRC, - RegisterClass RC, ValueType vt, - X86MemOperand x86memop, PatFrag mem_frag, - X86MemOperand x86scalar_mop, PatFrag scalar_mfrag, - string BrdcstStr, - Domain d, OpndItins itins, bit commutable> { - let isCommutable = commutable in { - def rr : PI, - EVEX_4V; - - def rrk: PI, EVEX_4V, EVEX_K; - - def rrkz: PI, EVEX_4V, EVEX_KZ; - } - + X86VectorVTInfo _, bit IsCommutable> { + defm rr: AVX512_maskable, EVEX_4V; let mayLoad = 1 in { - def rm : PI, EVEX_4V; - - def rmb : PI, EVEX_4V, EVEX_B; - - def rmk : PI, EVEX_4V, EVEX_K; - - def rmkz : PI, EVEX_4V, EVEX_KZ; - - def rmbk : PI, EVEX_4V, EVEX_B, EVEX_K; - - def rmbkz : PI, EVEX_4V, EVEX_B, EVEX_KZ; + defm rm: AVX512_maskable, EVEX_4V; + defm rmb: AVX512_maskable, + EVEX_4V, EVEX_B; + }//let mayLoad = 1 +} + +multiclass avx512_fp_binop_p opc, string OpcodeStr, SDNode OpNode, + bit IsCommutable = 0> { + defm PSZ : avx512_fp_packed, EVEX_V512, PS, + EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_packed, EVEX_V512, PD, VEX_W, + EVEX_CD8<64, CD8VF>; + + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp_packed, EVEX_V128, PS, + EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp_packed, EVEX_V256, PS, + EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp_packed, EVEX_V128, PD, VEX_W, + EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp_packed, EVEX_V256, PD, VEX_W, + EVEX_CD8<64, CD8VF>; } } -defm VADDPSZ : avx512_fp_packed<0x58, "addps", fadd, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 1>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; - -defm VADDPDZ : avx512_fp_packed<0x58, "addpd", fadd, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 1>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VMULPSZ : avx512_fp_packed<0x59, "mulps", fmul, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 1>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; -defm VMULPDZ : avx512_fp_packed<0x59, "mulpd", fmul, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 1>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VMINPSZ : avx512_fp_packed<0x5D, "minps", X86fmin, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 1>, - EVEX_V512, PS, EVEX_CD8<32, CD8VF>; -defm VMAXPSZ : avx512_fp_packed<0x5F, "maxps", X86fmax, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 1>, - EVEX_V512, PS, EVEX_CD8<32, CD8VF>; - -defm VMINPDZ : avx512_fp_packed<0x5D, "minpd", X86fmin, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 1>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; -defm VMAXPDZ : avx512_fp_packed<0x5F, "maxpd", X86fmax, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 1>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VSUBPSZ : avx512_fp_packed<0x5C, "subps", fsub, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 0>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; -defm VDIVPSZ : avx512_fp_packed<0x5E, "divps", fdiv, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 0>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; - -defm VSUBPDZ : avx512_fp_packed<0x5C, "subpd", fsub, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 0>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; -defm VDIVPDZ : avx512_fp_packed<0x5E, "divpd", fdiv, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 0>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; +defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>; +defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>; +defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, 1>; +defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, 1>; +defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>; +defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>; def : Pat<(v16f32 (int_x86_avx512_mask_max_ps_512 (v16f32 VR512:$src1), (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)), @@ -3045,18 +3275,18 @@ def : Pat<(v8f64 (int_x86_avx512_mask_min_pd_512 (v8f64 VR512:$src1), // AVX-512 VPTESTM instructions //===----------------------------------------------------------------------===// -multiclass avx512_vptest opc, string OpcodeStr, RegisterClass KRC, - RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, +multiclass avx512_vptest opc, string OpcodeStr, RegisterClass KRC, + RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, SDNode OpNode, ValueType vt> { def rr : AVX512PI, EVEX_4V; def rm : AVX512PI, EVEX_4V; } @@ -3083,154 +3313,122 @@ def : Pat <(i16 (int_x86_avx512_mask_ptestm_d_512 (v16i32 VR512:$src1), def : Pat <(i8 (int_x86_avx512_mask_ptestm_q_512 (v8i64 VR512:$src1), (v8i64 VR512:$src2), (i8 -1))), (COPY_TO_REGCLASS (VPTESTMQZrr VR512:$src1, VR512:$src2), GR8)>; + //===----------------------------------------------------------------------===// // AVX-512 Shift instructions //===----------------------------------------------------------------------===// multiclass avx512_shift_rmi opc, Format ImmFormR, Format ImmFormM, - string OpcodeStr, SDNode OpNode, RegisterClass RC, - ValueType vt, X86MemOperand x86memop, PatFrag mem_frag, - RegisterClass KRC> { - def ri : AVX512BIi8, EVEX_4V; - def rik : AVX512BIi8, EVEX_4V, EVEX_K; - def mi: AVX512BIi8, EVEX_4V; - def mik: AVX512BIi8, EVEX_4V, EVEX_K; + string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + defm ri : AVX512_maskable, AVX512BIi8Base, EVEX_4V; + defm mi : AVX512_maskable, AVX512BIi8Base, EVEX_4V; } multiclass avx512_shift_rrm opc, string OpcodeStr, SDNode OpNode, - RegisterClass RC, ValueType vt, ValueType SrcVT, - PatFrag bc_frag, RegisterClass KRC> { - // src2 is always 128-bit - def rr : AVX512BI, EVEX_4V; - def rrk : AVX512BI, EVEX_4V, EVEX_K; - def rm : AVX512BI, EVEX_4V; - def rmk : AVX512BI, EVEX_4V, EVEX_K; + ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { + // src2 is always 128-bit + defm rr : AVX512_maskable, AVX512BIBase, EVEX_4V; + defm rm : AVX512_maskable, AVX512BIBase, EVEX_4V; +} + +multiclass avx512_shift_sizes opc, string OpcodeStr, SDNode OpNode, + ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { + defm Z : avx512_shift_rrm, EVEX_V512; +} + +multiclass avx512_shift_types opcd, bits<8> opcq, string OpcodeStr, + SDNode OpNode> { + defm D : avx512_shift_sizes, EVEX_CD8<32, CD8VQ>; + defm Q : avx512_shift_sizes, EVEX_CD8<64, CD8VQ>, VEX_W; } defm VPSRLDZ : avx512_shift_rmi<0x72, MRM2r, MRM2m, "vpsrld", X86vsrli, - VR512, v16i32, i512mem, memopv16i32, VK16WM>, + v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPSRLDZ : avx512_shift_rrm<0xD2, "vpsrld", X86vsrl, - VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512, - EVEX_CD8<32, CD8VQ>; - defm VPSRLQZ : avx512_shift_rmi<0x73, MRM2r, MRM2m, "vpsrlq", X86vsrli, - VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512, + v8i64_info>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; -defm VPSRLQZ : avx512_shift_rrm<0xD3, "vpsrlq", X86vsrl, - VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512, - EVEX_CD8<64, CD8VQ>, VEX_W; defm VPSLLDZ : avx512_shift_rmi<0x72, MRM6r, MRM6m, "vpslld", X86vshli, - VR512, v16i32, i512mem, memopv16i32, VK16WM>, EVEX_V512, + v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPSLLDZ : avx512_shift_rrm<0xF2, "vpslld", X86vshl, - VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512, - EVEX_CD8<32, CD8VQ>; - defm VPSLLQZ : avx512_shift_rmi<0x73, MRM6r, MRM6m, "vpsllq", X86vshli, - VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512, + v8i64_info>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; -defm VPSLLQZ : avx512_shift_rrm<0xF3, "vpsllq", X86vshl, - VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512, - EVEX_CD8<64, CD8VQ>, VEX_W; defm VPSRADZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsrad", X86vsrai, - VR512, v16i32, i512mem, memopv16i32, VK16WM>, + v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPSRADZ : avx512_shift_rrm<0xE2, "vpsrad", X86vsra, - VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512, - EVEX_CD8<32, CD8VQ>; - defm VPSRAQZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsraq", X86vsrai, - VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512, + v8i64_info>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; -defm VPSRAQZ : avx512_shift_rrm<0xE2, "vpsraq", X86vsra, - VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512, - EVEX_CD8<64, CD8VQ>, VEX_W; + +defm VPSLL : avx512_shift_types<0xF2, 0xF3, "vpsll", X86vshl>; +defm VPSRA : avx512_shift_types<0xE2, 0xE2, "vpsra", X86vsra>; +defm VPSRL : avx512_shift_types<0xD2, 0xD3, "vpsrl", X86vsrl>; //===-------------------------------------------------------------------===// // Variable Bit Shifts //===-------------------------------------------------------------------===// multiclass avx512_var_shift opc, string OpcodeStr, SDNode OpNode, - RegisterClass RC, ValueType vt, - X86MemOperand x86memop, PatFrag mem_frag> { - def rr : AVX5128I, - EVEX_4V; - def rm : AVX5128I, - EVEX_4V; + X86VectorVTInfo _> { + defm rr : AVX512_maskable, AVX5128IBase, EVEX_4V; + defm rm : AVX512_maskable, AVX5128IBase, EVEX_4V; } -defm VPSLLVDZ : avx512_var_shift<0x47, "vpsllvd", shl, VR512, v16i32, - i512mem, memopv16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPSLLVQZ : avx512_var_shift<0x47, "vpsllvq", shl, VR512, v8i64, - i512mem, memopv8i64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; -defm VPSRLVDZ : avx512_var_shift<0x45, "vpsrlvd", srl, VR512, v16i32, - i512mem, memopv16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPSRLVQZ : avx512_var_shift<0x45, "vpsrlvq", srl, VR512, v8i64, - i512mem, memopv8i64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; -defm VPSRAVDZ : avx512_var_shift<0x46, "vpsravd", sra, VR512, v16i32, - i512mem, memopv16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPSRAVQZ : avx512_var_shift<0x46, "vpsravq", sra, VR512, v8i64, - i512mem, memopv8i64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; +multiclass avx512_var_shift_sizes opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo _> { + defm Z : avx512_var_shift, EVEX_V512; +} + +multiclass avx512_var_shift_types opc, string OpcodeStr, + SDNode OpNode> { + defm D : avx512_var_shift_sizes, EVEX_CD8<32, CD8VQ>; + defm Q : avx512_var_shift_sizes, EVEX_CD8<64, CD8VQ>, VEX_W; +} + +defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>; +defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>; +defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>; //===----------------------------------------------------------------------===// // AVX-512 - MOVDDUP //===----------------------------------------------------------------------===// -multiclass avx512_movddup { def rr : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX; def rm : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX; } @@ -3247,11 +3445,11 @@ multiclass avx512_replicate_sfp op, SDNode OpNode, string OpcodeStr, ValueType vt, RegisterClass RC, PatFrag mem_frag, X86MemOperand x86memop> { def rr : AVX512XSI, EVEX; let mayLoad = 1 in def rm : AVX512XSI, EVEX; } @@ -3298,9 +3496,11 @@ let Predicates = [HasAVX512] in { //===----------------------------------------------------------------------===// // FMA - Fused Multiply Operations // + let Constraints = "$src1 = $dst" in { -multiclass avx512_fma3p_rm opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { +// Omitting the parameter OpNode (= null_frag) disables ISel pattern matching. +multiclass avx512_fma3p_rm opc, string OpcodeStr, X86VectorVTInfo _, + SDPatternOperator OpNode = null_frag> { defm r: AVX512_maskable_3src opc, string OpcodeStr, SDNode OpNode, AVX512FMA3Base; let mayLoad = 1 in - def m: AVX512FMA3; - def mb: AVX512FMA3, EVEX_B; -} + defm m: AVX512_maskable_3src, + AVX512FMA3Base; + + defm mb: AVX512_maskable_3src, + AVX512FMA3Base, EVEX_B; + } } // Constraints = "$src1 = $dst" -let ExeDomain = SSEPackedSingle in { - defm VFMADD213PSZ : avx512_fma3p_rm<0xA8, "vfmadd213ps", X86Fmadd, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMSUB213PSZ : avx512_fma3p_rm<0xAA, "vfmsub213ps", X86Fmsub, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMADDSUB213PSZ : avx512_fma3p_rm<0xA6, "vfmaddsub213ps", X86Fmaddsub, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMSUBADD213PSZ : avx512_fma3p_rm<0xA7, "vfmsubadd213ps", X86Fmsubadd, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFNMADD213PSZ : avx512_fma3p_rm<0xAC, "vfnmadd213ps", X86Fnmadd, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFNMSUB213PSZ : avx512_fma3p_rm<0xAE, "vfnmsub213ps", X86Fnmsub, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; +multiclass avx512_fma3p_forms opc213, bits<8> opc231, + string OpcodeStr, X86VectorVTInfo VTI, + SDPatternOperator OpNode> { + defm v213r : avx512_fma3p_rm, EVEX_CD8; + + defm v231r : avx512_fma3p_rm, EVEX_CD8; } + +multiclass avx512_fma3p opc213, bits<8> opc231, + string OpcodeStr, + SDPatternOperator OpNode> { +let ExeDomain = SSEPackedSingle in { + defm NAME##PSZ : avx512_fma3p_forms, EVEX_V512; + defm NAME##PSZ256 : avx512_fma3p_forms, EVEX_V256; + defm NAME##PSZ128 : avx512_fma3p_forms, EVEX_V128; + } let ExeDomain = SSEPackedDouble in { - defm VFMADD213PDZ : avx512_fma3p_rm<0xA8, "vfmadd213pd", X86Fmadd, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMSUB213PDZ : avx512_fma3p_rm<0xAA, "vfmsub213pd", X86Fmsub, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMADDSUB213PDZ : avx512_fma3p_rm<0xA6, "vfmaddsub213pd", X86Fmaddsub, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMSUBADD213PDZ : avx512_fma3p_rm<0xA7, "vfmsubadd213pd", X86Fmsubadd, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFNMADD213PDZ : avx512_fma3p_rm<0xAC, "vfnmadd213pd", X86Fnmadd, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFNMSUB213PDZ : avx512_fma3p_rm<0xAE, "vfnmsub213pd", X86Fnmsub, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm NAME##PDZ : avx512_fma3p_forms, EVEX_V512, VEX_W; + defm NAME##PDZ256 : avx512_fma3p_forms, EVEX_V256, VEX_W; + defm NAME##PDZ128 : avx512_fma3p_forms, EVEX_V128, VEX_W; + } } +defm VFMADD : avx512_fma3p<0xA8, 0xB8, "vfmadd", X86Fmadd>; +defm VFMSUB : avx512_fma3p<0xAA, 0xBA, "vfmsub", X86Fmsub>; +defm VFMADDSUB : avx512_fma3p<0xA6, 0xB6, "vfmaddsub", X86Fmaddsub>; +defm VFMSUBADD : avx512_fma3p<0xA7, 0xB7, "vfmsubadd", X86Fmsubadd>; +defm VFNMADD : avx512_fma3p<0xAC, 0xBC, "vfnmadd", X86Fnmadd>; +defm VFNMSUB : avx512_fma3p<0xAE, 0xBE, "vfnmsub", X86Fnmsub>; + let Constraints = "$src1 = $dst" in { multiclass avx512_fma3p_m132 opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { let mayLoad = 1 in def m: AVX512FMA3; def mb: AVX512FMA3 opc, string OpcodeStr, SDNode OpNode, } // Constraints = "$src1 = $dst" +multiclass avx512_fma3p_m132_f opc, + string OpcodeStr, + SDNode OpNode> { + let ExeDomain = SSEPackedSingle in { - defm VFMADD132PSZ : avx512_fma3p_m132<0x98, "vfmadd132ps", X86Fmadd, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMSUB132PSZ : avx512_fma3p_m132<0x9A, "vfmsub132ps", X86Fmsub, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMADDSUB132PSZ : avx512_fma3p_m132<0x96, "vfmaddsub132ps", X86Fmaddsub, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMSUBADD132PSZ : avx512_fma3p_m132<0x97, "vfmsubadd132ps", X86Fmsubadd, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFNMADD132PSZ : avx512_fma3p_m132<0x9C, "vfnmadd132ps", X86Fnmadd, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFNMSUB132PSZ : avx512_fma3p_m132<0x9E, "vfnmsub132ps", X86Fnmsub, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -} + defm NAME##PSZ : avx512_fma3p_m132, EVEX_V512, EVEX_CD8<32, CD8VF>; + defm NAME##PSZ256 : avx512_fma3p_m132, EVEX_V256, EVEX_CD8<32, CD8VF>; + defm NAME##PSZ128 : avx512_fma3p_m132, EVEX_V128, EVEX_CD8<32, CD8VF>; + } let ExeDomain = SSEPackedDouble in { - defm VFMADD132PDZ : avx512_fma3p_m132<0x98, "vfmadd132pd", X86Fmadd, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMSUB132PDZ : avx512_fma3p_m132<0x9A, "vfmsub132pd", X86Fmsub, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMADDSUB132PDZ : avx512_fma3p_m132<0x96, "vfmaddsub132pd", X86Fmaddsub, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMSUBADD132PDZ : avx512_fma3p_m132<0x97, "vfmsubadd132pd", X86Fmsubadd, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFNMADD132PDZ : avx512_fma3p_m132<0x9C, "vfnmadd132pd", X86Fnmadd, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFNMSUB132PDZ : avx512_fma3p_m132<0x9E, "vfnmsub132pd", X86Fnmsub, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm NAME##PDZ : avx512_fma3p_m132, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VF>; + defm NAME##PDZ256 : avx512_fma3p_m132, EVEX_V256, VEX_W, EVEX_CD8<32, CD8VF>; + defm NAME##PDZ128 : avx512_fma3p_m132, EVEX_V128, VEX_W, EVEX_CD8<32, CD8VF>; + } } +defm VFMADD132 : avx512_fma3p_m132_f<0x98, "vfmadd132", X86Fmadd>; +defm VFMSUB132 : avx512_fma3p_m132_f<0x9A, "vfmsub132", X86Fmsub>; +defm VFMADDSUB132 : avx512_fma3p_m132_f<0x96, "vfmaddsub132", X86Fmaddsub>; +defm VFMSUBADD132 : avx512_fma3p_m132_f<0x97, "vfmsubadd132", X86Fmsubadd>; +defm VFNMADD132 : avx512_fma3p_m132_f<0x9C, "vfnmadd132", X86Fnmadd>; +defm VFNMSUB132 : avx512_fma3p_m132_f<0x9E, "vfnmsub132", X86Fnmsub>; + + // Scalar FMA let Constraints = "$src1 = $dst" in { -multiclass avx512_fma3s_rm opc, string OpcodeStr, SDNode OpNode, - RegisterClass RC, ValueType OpVT, - X86MemOperand x86memop, Operand memop, +multiclass avx512_fma3s_rm opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, ValueType OpVT, + X86MemOperand x86memop, Operand memop, PatFrag mem_frag> { let isCommutable = 1 in def r : AVX512FMA3; let mayLoad = 1 in def m : AVX512FMA3; @@ -3475,12 +3661,12 @@ multiclass avx512_vcvtsi opc, RegisterClass SrcRC, RegisterClass DstRC, X86MemOperand x86memop, string asm> { let hasSideEffects = 0 in { def rr : SI, + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, EVEX_4V; let mayLoad = 1 in def rm : SI, + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, EVEX_4V; } // hasSideEffects = 0 } @@ -3548,12 +3734,12 @@ multiclass avx512_cvt_s_int opc, RegisterClass SrcRC, RegisterClass DstR string asm> { let hasSideEffects = 0 in { def rr : SI, EVEX, VEX_LIG, Requires<[HasAVX512]>; let mayLoad = 1 in def rm : SI, EVEX, VEX_LIG, + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG, Requires<[HasAVX512]>; } // hasSideEffects = 0 } @@ -3651,10 +3837,10 @@ multiclass avx512_cvt_s opc, RegisterClass SrcRC, RegisterClass DstRC, SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, string asm> { def rr : SI, EVEX; def rm : SI, EVEX; } @@ -3727,21 +3913,21 @@ def : Pat<(extloadf32 addr:$src), def : Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>, Requires<[HasAVX512]>; -multiclass avx512_vcvt_fp_with_rc opc, string asm, RegisterClass SrcRC, - RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag, +multiclass avx512_vcvt_fp_with_rc opc, string asm, RegisterClass SrcRC, + RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT, ValueType InVT, Domain d> { let hasSideEffects = 0 in { def rr : AVX512PI, EVEX; def rrb : AVX512PI, EVEX, EVEX_B, EVEX_RC; let mayLoad = 1 in def rm : AVX512PI, EVEX; } // hasSideEffects = 0 @@ -3753,12 +3939,12 @@ multiclass avx512_vcvt_fp opc, string asm, RegisterClass SrcRC, Domain d> { let hasSideEffects = 0 in { def rr : AVX512PI, EVEX; let mayLoad = 1 in def rm : AVX512PI, EVEX; } // hasSideEffects = 0 @@ -3775,7 +3961,7 @@ defm VCVTPS2PDZ : avx512_vcvt_fp<0x5A, "vcvtps2pd", VR256X, VR512, fextend, EVEX_CD8<32, CD8VH>; def : Pat<(v8f64 (extloadv8f32 addr:$src)), (VCVTPS2PDZrm addr:$src)>; - + def : Pat<(v8f32 (int_x86_avx512_mask_cvtpd2ps_512 (v8f64 VR512:$src), (bc_v8f32(v8i32 immAllZerosV)), (i8 -1), (i32 FROUND_CURRENT))), (VCVTPD2PSZrr VR512:$src)>; @@ -3804,7 +3990,7 @@ defm VCVTTPS2DQZ : avx512_vcvt_fp<0x5B, "vcvttps2dq", VR512, VR512, fp_to_sint, EVEX_CD8<32, CD8VF>; defm VCVTTPD2DQZ : avx512_vcvt_fp<0xE6, "vcvttpd2dq", VR512, VR256X, fp_to_sint, - memopv8f64, f512mem, v8i32, v8f64, + memopv8f64, f512mem, v8i32, v8f64, SSEPackedDouble>, EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; @@ -3822,7 +4008,7 @@ defm VCVTTPD2UDQZ : avx512_vcvt_fp<0x78, "vcvttpd2udq", VR512, VR256X, fp_to_uin memopv8f64, f512mem, v8i32, v8f64, SSEPackedDouble>, EVEX_V512, PS, VEX_W, EVEX_CD8<64, CD8VF>; - + // cvttpd2udq (src, 0, mask-all-ones, sae-current) def : Pat<(v8i32 (int_x86_avx512_mask_cvttpd2udq_512 (v8f64 VR512:$src), (v8i32 immAllZerosV), (i8 -1), FROUND_CURRENT)), @@ -3832,16 +4018,16 @@ defm VCVTUDQ2PDZ : avx512_vcvt_fp<0x7A, "vcvtudq2pd", VR256X, VR512, uint_to_fp, memopv4i64, f256mem, v8f64, v8i32, SSEPackedDouble>, EVEX_V512, XS, EVEX_CD8<32, CD8VH>; - + defm VCVTUDQ2PSZ : avx512_vcvt_fp_with_rc<0x7A, "vcvtudq2ps", VR512, VR512, uint_to_fp, memopv16i32, f512mem, v16f32, v16i32, SSEPackedSingle>, EVEX_V512, XD, EVEX_CD8<32, CD8VF>; def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), - (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr + (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; - + def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))), (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; @@ -3849,7 +4035,7 @@ def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))), def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))), (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; - + def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))), (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr (v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; @@ -3876,14 +4062,14 @@ multiclass avx512_vcvt_fp2int opc, string asm, RegisterClass SrcRC, X86MemOperand x86memop, Domain d> { let hasSideEffects = 0 in { def rr : AVX512PI, EVEX; def rrb : AVX512PI, EVEX, EVEX_B, EVEX_RC; let mayLoad = 1 in def rm : AVX512PI, EVEX; } // hasSideEffects = 0 } @@ -3942,12 +4128,12 @@ multiclass avx512_cvtps2ph { def rr : AVX512AIi8<0x1D, MRMDestReg, (outs destRC:$dst), (ins srcRC:$src1, i32i8imm:$src2), - "vcvtps2ph \t{$src2, $src1, $dst|$dst, $src1, $src2}", + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX; let hasSideEffects = 0, mayStore = 1 in def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src1, i32i8imm:$src2), - "vcvtps2ph \t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX; + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX; } defm VCVTPH2PSZ : avx512_cvtph2ps, EVEX_V512, @@ -3994,7 +4180,7 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in { VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; } } - + /// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd multiclass avx512_fp14_s opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop> { @@ -4002,12 +4188,12 @@ multiclass avx512_fp14_s opc, string OpcodeStr, RegisterClass RC, def rr : AVX5128I, EVEX_4V; + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; let mayLoad = 1 in { def rm : AVX5128I, EVEX_4V; + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; } } } @@ -4043,26 +4229,49 @@ def : Pat <(v2f64 (int_x86_avx512_rsqrt14_sd (v2f64 VR128X:$src1), /// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd multiclass avx512_fp14_p opc, string OpcodeStr, SDNode OpNode, - RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_frag, ValueType OpVt> { - def r : AVX5128I, - EVEX; - def m : AVX5128I, - EVEX; -} -defm VRSQRT14PSZ : avx512_fp14_p<0x4E, "vrsqrt14ps", X86frsqrt, VR512, f512mem, - memopv16f32, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VRSQRT14PDZ : avx512_fp14_p<0x4E, "vrsqrt14pd", X86frsqrt, VR512, f512mem, - memopv8f64, v8f64>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; -defm VRCP14PSZ : avx512_fp14_p<0x4C, "vrcp14ps", X86frcp, VR512, f512mem, - memopv16f32, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VRCP14PDZ : avx512_fp14_p<0x4C, "vrcp14pd", X86frcp, VR512, f512mem, - memopv8f64, v8f64>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; + X86VectorVTInfo _> { + defm r: AVX512_maskable, EVEX, T8PD; + let mayLoad = 1 in { + defm m: AVX512_maskable, EVEX, T8PD; + defm mb: AVX512_maskable, + EVEX, T8PD, EVEX_B; + } +} + +multiclass avx512_fp14_p_vl_all opc, string OpcodeStr, SDNode OpNode> { + defm PSZ : avx512_fp14_p, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp14_p, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp14_p, + EVEX_V128, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp14_p, + EVEX_V256, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp14_p, + EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp14_p, + EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; + } +} + +defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>; +defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>; def : Pat <(v16f32 (int_x86_avx512_rsqrt14_ps_512 (v16f32 VR512:$src), (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), @@ -4079,126 +4288,100 @@ def : Pat <(v8f64 (int_x86_avx512_rcp14_pd_512 (v8f64 VR512:$src), (VRCP14PDZr VR512:$src)>; /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd -multiclass avx512_fp28_s opc, string OpcodeStr, RegisterClass RC, - X86MemOperand x86memop> { - let hasSideEffects = 0, Predicates = [HasERI] in { - def rr : AVX5128I, EVEX_4V; - def rrb : AVX5128I, EVEX_4V, EVEX_B; - let mayLoad = 1 in { - def rm : AVX5128I, EVEX_4V; - } +multiclass avx512_fp28_s opc, string OpcodeStr,X86VectorVTInfo _, + SDNode OpNode> { + + defm r : AVX512_maskable_scalar; + + defm rb : AVX512_maskable_scalar, EVEX_B; + + defm m : AVX512_maskable_scalar; } + +multiclass avx512_eri_s opc, string OpcodeStr, SDNode OpNode> { + defm SS : avx512_fp28_s, + EVEX_CD8<32, CD8VT1>; + defm SD : avx512_fp28_s, + EVEX_CD8<64, CD8VT1>, VEX_W; } -defm VRCP28SS : avx512_fp28_s<0xCB, "vrcp28ss", FR32X, f32mem>, - EVEX_CD8<32, CD8VT1>; -defm VRCP28SD : avx512_fp28_s<0xCB, "vrcp28sd", FR64X, f64mem>, - VEX_W, EVEX_CD8<64, CD8VT1>; -defm VRSQRT28SS : avx512_fp28_s<0xCD, "vrsqrt28ss", FR32X, f32mem>, - EVEX_CD8<32, CD8VT1>; -defm VRSQRT28SD : avx512_fp28_s<0xCD, "vrsqrt28sd", FR64X, f64mem>, - VEX_W, EVEX_CD8<64, CD8VT1>; +let hasSideEffects = 0, Predicates = [HasERI] in { + defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s>, T8PD, EVEX_4V; + defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V; +} +/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd -def : Pat <(v4f32 (int_x86_avx512_rcp28_ss (v4f32 VR128X:$src1), - (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1), - FROUND_NO_EXC)), - (COPY_TO_REGCLASS (VRCP28SSrrb (COPY_TO_REGCLASS VR128X:$src1, FR32X), - (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>; +multiclass avx512_fp28_p opc, string OpcodeStr, X86VectorVTInfo _, + SDNode OpNode> { -def : Pat <(v2f64 (int_x86_avx512_rcp28_sd (v2f64 VR128X:$src1), - (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1), - FROUND_NO_EXC)), - (COPY_TO_REGCLASS (VRCP28SDrrb (COPY_TO_REGCLASS VR128X:$src1, FR64X), - (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>; + defm r : AVX512_maskable; -def : Pat <(v4f32 (int_x86_avx512_rsqrt28_ss (v4f32 VR128X:$src1), - (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1), - FROUND_NO_EXC)), - (COPY_TO_REGCLASS (VRSQRT28SSrrb (COPY_TO_REGCLASS VR128X:$src1, FR32X), - (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>; + defm rb : AVX512_maskable, EVEX_B; -def : Pat <(v2f64 (int_x86_avx512_rsqrt28_sd (v2f64 VR128X:$src1), - (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1), - FROUND_NO_EXC)), - (COPY_TO_REGCLASS (VRSQRT28SDrrb (COPY_TO_REGCLASS VR128X:$src1, FR64X), - (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>; + defm m : AVX512_maskable; -/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd -multiclass avx512_fp28_p opc, string OpcodeStr, - RegisterClass RC, X86MemOperand x86memop> { - let hasSideEffects = 0, Predicates = [HasERI] in { - def r : AVX5128I, EVEX; - def rb : AVX5128I, EVEX, EVEX_B; - def m : AVX5128I, EVEX; - } + defm mb : AVX512_maskable, EVEX_B; } -defm VRSQRT28PSZ : avx512_fp28_p<0xCC, "vrsqrt28ps", VR512, f512mem>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VRSQRT28PDZ : avx512_fp28_p<0xCC, "vrsqrt28pd", VR512, f512mem>, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; -defm VRCP28PSZ : avx512_fp28_p<0xCA, "vrcp28ps", VR512, f512mem>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VRCP28PDZ : avx512_fp28_p<0xCA, "vrcp28pd", VR512, f512mem>, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; - -def : Pat <(v16f32 (int_x86_avx512_rsqrt28_ps (v16f32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_NO_EXC)), - (VRSQRT28PSZrb VR512:$src)>; -def : Pat <(v8f64 (int_x86_avx512_rsqrt28_pd (v8f64 VR512:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_NO_EXC)), - (VRSQRT28PDZrb VR512:$src)>; - -def : Pat <(v16f32 (int_x86_avx512_rcp28_ps (v16f32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_NO_EXC)), - (VRCP28PSZrb VR512:$src)>; -def : Pat <(v8f64 (int_x86_avx512_rcp28_pd (v8f64 VR512:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_NO_EXC)), - (VRCP28PDZrb VR512:$src)>; - -multiclass avx512_sqrt_packed opc, string OpcodeStr, SDNode OpNode, - OpndItins itins_s, OpndItins itins_d> { - def PSZrr :AVX512PSI, - EVEX, EVEX_V512; - let mayLoad = 1 in - def PSZrm : AVX512PSI, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; +multiclass avx512_eri opc, string OpcodeStr, SDNode OpNode> { + defm PS : avx512_fp28_p, + EVEX_CD8<32, CD8VF>; + defm PD : avx512_fp28_p, + VEX_W, EVEX_CD8<32, CD8VF>; +} - def PDZrr : AVX512PDI, - EVEX, EVEX_V512; +let Predicates = [HasERI], hasSideEffects = 0 in { - let mayLoad = 1 in - def PDZrm : AVX512PDI, EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>; + defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX, EVEX_V512, T8PD; + defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28>, EVEX, EVEX_V512, T8PD; + defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2>, EVEX, EVEX_V512, T8PD; +} +multiclass avx512_sqrt_packed opc, string OpcodeStr, + SDNode OpNode, X86VectorVTInfo _>{ + defm r: AVX512_maskable, EVEX; + let mayLoad = 1 in { + defm m: AVX512_maskable, EVEX; + + defm mb: AVX512_maskable, + EVEX, EVEX_B; + } } multiclass avx512_sqrt_scalar opc, string OpcodeStr, @@ -4214,7 +4397,7 @@ multiclass avx512_sqrt_scalar opc, string OpcodeStr, (ins VR128X:$src1, VR128X:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, + [(set VR128X:$dst, (F32Int VR128X:$src1, VR128X:$src2))], itins_s.rr>, XS, EVEX_4V; let mayLoad = 1 in { @@ -4228,7 +4411,7 @@ multiclass avx512_sqrt_scalar opc, string OpcodeStr, (ins VR128X:$src1, ssmem:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, + [(set VR128X:$dst, (F32Int VR128X:$src1, sse_load_f32:$src2))], itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>; } @@ -4242,7 +4425,7 @@ multiclass avx512_sqrt_scalar opc, string OpcodeStr, (ins VR128X:$src1, VR128X:$src2), !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, + [(set VR128X:$dst, (F64Int VR128X:$src1, VR128X:$src2))], itins_s.rr>, XD, EVEX_4V, VEX_W; let mayLoad = 1 in { @@ -4256,27 +4439,51 @@ multiclass avx512_sqrt_scalar opc, string OpcodeStr, (ins VR128X:$src1, sdmem:$src2), !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F64Int VR128X:$src1, sse_load_f64:$src2))]>, + [(set VR128X:$dst, + (F64Int VR128X:$src1, sse_load_f64:$src2))]>, XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; } } +multiclass avx512_sqrt_packed_all opc, string OpcodeStr, + SDNode OpNode> { + defm PSZ : avx512_sqrt_packed, + EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_sqrt_packed, + EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_sqrt_packed, + EVEX_V128, PS, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_sqrt_packed, + EVEX_V256, PS, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_sqrt_packed, + EVEX_V128, VEX_W, PD, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_sqrt_packed, + EVEX_V256, VEX_W, PD, EVEX_CD8<64, CD8VF>; + } +} + +defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>; -defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt", - int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, - SSE_SQRTSS, SSE_SQRTSD>, - avx512_sqrt_packed<0x51, "vsqrt", fsqrt, - SSE_SQRTPS, SSE_SQRTPD>; +defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt", + int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, + SSE_SQRTSS, SSE_SQRTSD>; let Predicates = [HasAVX512] in { def : Pat<(v16f32 (int_x86_avx512_sqrt_ps_512 (v16f32 VR512:$src1), (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_CURRENT)), - (VSQRTPSZrr VR512:$src1)>; + (VSQRTPSZr VR512:$src1)>; def : Pat<(v8f64 (int_x86_avx512_sqrt_pd_512 (v8f64 VR512:$src1), (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_CURRENT)), - (VSQRTPDZrr VR512:$src1)>; - + (VSQRTPDZr VR512:$src1)>; + def : Pat<(f32 (fsqrt FR32X:$src)), (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; def : Pat<(f32 (fsqrt (load addr:$src))), @@ -4385,7 +4592,7 @@ let ExeDomain = GenericDomain in { (ins VR128X:$src1, ssmem:$src2, i32i8imm:$src3), !strconcat(OpcodeStr, "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128X:$dst, (F32Int VR128X:$src1, + [(set VR128X:$dst, (F32Int VR128X:$src1, sse_load_f32:$src2, imm:$src3))]>, EVEX_CD8<32, CD8VT1>; @@ -4426,14 +4633,14 @@ let ExeDomain = d in { def r : AVX512AIi8, EVEX; // Vector intrinsic operation, mem def m : AVX512AIi8, EVEX; } // ExeDomain } @@ -4464,20 +4671,20 @@ let ExeDomain = d in { def r : AVX512AIi8, EVEX_4V; def m : AVX512AIi8, EVEX_4V; } // ExeDomain } defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", ssmem, FR32X, SSEPackedSingle>, EVEX_CD8<32, CD8VT1>; - + defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", sdmem, FR64X, SSEPackedDouble>, EVEX_CD8<64, CD8VT1>; @@ -4533,32 +4740,32 @@ multiclass avx512_trunc_sat opc, string OpcodeStr, RegisterClass KRC, X86MemOperand x86memop> { def rr : AVX512XS8I, EVEX; def rrk : AVX512XS8I, EVEX, EVEX_K; def rrkz : AVX512XS8I, EVEX, EVEX_KZ; def mr : AVX512XS8I, EVEX; def mrk : AVX512XS8I, EVEX, EVEX_K; } -defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM, +defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM, i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; defm VPMOVSQB : avx512_trunc_sat<0x22, "vpmovsqb", VR128X, VR512, VK8WM, i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; @@ -4612,36 +4819,36 @@ multiclass avx512_extend opc, string OpcodeStr, RegisterClass KRC, def rr : AVX5128I, EVEX; def rrk : AVX5128I, EVEX, EVEX_K; def rrkz : AVX5128I, EVEX, EVEX_KZ; let mayLoad = 1 in { def rm : AVX5128I, EVEX; def rmk : AVX5128I, EVEX, EVEX_K; def rmkz : AVX5128I, EVEX, EVEX_KZ; } @@ -4689,7 +4896,7 @@ let mayLoad = 1, def rm : AVX5128I, EVEX, EVEX_K; } @@ -4706,7 +4913,7 @@ defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", VK16WM, VR512, vz32mem>, defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", VK8WM, VR256X, vz64mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; } - + defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", VK8WM, VR512, vy64xmem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", VK16WM, VR512, vz32mem>, @@ -4723,7 +4930,7 @@ let mayStore = 1, Constraints = "$mask = $mask_wb" in def mr : AVX5128I, EVEX, EVEX_K; } @@ -4756,7 +4963,7 @@ multiclass avx512_gather_scatter_prefetch opc, Format F, string OpcodeSt RegisterClass KRC, X86MemOperand memop> { let Predicates = [HasPFI], hasSideEffects = 1 in def m : AVX5128I, EVEX, EVEX_K; } @@ -4771,7 +4978,7 @@ defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd", defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd", VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; - + defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps", VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; @@ -4816,14 +5023,14 @@ multiclass avx512_shufp, EVEX_4V, Sched<[WriteShuffleLd, ReadAfterLd]>; def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, (i8 imm:$src3))))], d, IIC_SSE_SHUFP>, EVEX_4V, Sched<[WriteShuffle]>; @@ -4863,7 +5070,7 @@ multiclass avx512_valign { def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2, i8imm:$src3), !strconcat("valign"##_.Suffix, - " \t{$src3, $src2, $src1, $dst|" + "\t{$src3, $src2, $src1, $dst|" "$dst, $src1, $src2, $src3}"), []>, EVEX_4V; } @@ -4879,43 +5086,43 @@ multiclass avx512_vpabs opc, string OpcodeStr, ValueType OpVT, X86MemOperand x86memop, X86MemOperand x86scalar_mop, string BrdcstStr> { def rr : AVX5128I, EVEX; def rrk : AVX5128I, EVEX, EVEX_K; def rrkz : AVX5128I, EVEX, EVEX_KZ; let mayLoad = 1 in { def rm : AVX5128I, EVEX; def rmk : AVX5128I, EVEX, EVEX_K; def rmkz : AVX5128I, EVEX, EVEX_KZ; def rmb : AVX5128I, EVEX, EVEX_B; def rmbk : AVX5128I, EVEX, EVEX_B, EVEX_K; def rmbkz : AVX5128I, EVEX, EVEX_B, EVEX_KZ; @@ -4945,54 +5152,54 @@ def : Pat<(v8i64 (int_x86_avx512_mask_pabs_q_512 (v8i64 VR512:$src), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))), (VPABSQZrr VR512:$src)>; -multiclass avx512_conflict opc, string OpcodeStr, +multiclass avx512_conflict opc, string OpcodeStr, RegisterClass RC, RegisterClass KRC, X86MemOperand x86memop, X86MemOperand x86scalar_mop, string BrdcstStr> { def rr : AVX5128I, EVEX; def rm : AVX5128I, EVEX; def rmb : AVX5128I, EVEX, EVEX_B; def rrkz : AVX5128I, EVEX, EVEX_KZ; def rmkz : AVX5128I, EVEX, EVEX_KZ; def rmbkz : AVX5128I, EVEX, EVEX_KZ, EVEX_B; - + let Constraints = "$src1 = $dst" in { def rrk : AVX5128I, EVEX, EVEX_K; def rmk : AVX5128I, EVEX, EVEX_K; def rmbk : AVX5128I, EVEX, EVEX_K, EVEX_B; } @@ -5068,10 +5275,10 @@ def : Pat<(truncstorei1 GR8:$src, addr:$dst), multiclass cvt_by_vec_width opc, X86VectorVTInfo Vec, string OpcodeStr > { def rr : AVX512XS8I, EVEX; } - + multiclass cvt_mask_by_elt_width opc, AVX512VLVectorVTInfo VTInfo, string OpcodeStr, Predicate prd> { let Predicates = [prd] in @@ -5093,5 +5300,108 @@ multiclass avx512_convert_mask_to_vector { defm NAME##Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, OpcodeStr, HasDQI>, VEX_W; } - + defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">; + +//===----------------------------------------------------------------------===// +// AVX-512 - COMPRESS and EXPAND +// +multiclass compress_by_vec_width opc, X86VectorVTInfo _, + string OpcodeStr> { + def rrkz : AVX5128I, EVEX_KZ; + + let Constraints = "$src0 = $dst" in + def rrk : AVX5128I, EVEX_K; + + let mayStore = 1 in { + def mrk : AVX5128I, + EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; + } +} + +multiclass compress_by_elt_width opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : compress_by_vec_width, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : compress_by_vec_width, EVEX_V256; + defm Z128 : compress_by_vec_width, EVEX_V128; + } +} + +defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", avx512vl_i32_info>, + EVEX; +defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", avx512vl_i64_info>, + EVEX, VEX_W; +defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", avx512vl_f32_info>, + EVEX; +defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info>, + EVEX, VEX_W; + +// expand +multiclass expand_by_vec_width opc, X86VectorVTInfo _, + string OpcodeStr> { + def rrkz : AVX5128I, EVEX_KZ; + + let Constraints = "$src0 = $dst" in + def rrk : AVX5128I, EVEX_K; + + let mayLoad = 1, Constraints = "$src0 = $dst" in + def rmk : AVX5128I, + EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; + + let mayLoad = 1 in + def rmkz : AVX5128I, + EVEX_KZ, EVEX_CD8<_.EltSize, CD8VT1>; + +} + +multiclass expand_by_elt_width opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : expand_by_vec_width, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : expand_by_vec_width, EVEX_V256; + defm Z128 : expand_by_vec_width, EVEX_V128; + } +} + +defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", avx512vl_i32_info>, + EVEX; +defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", avx512vl_i64_info>, + EVEX, VEX_W; +defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>, + EVEX; +defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>, + EVEX, VEX_W;