X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86InstrAVX512.td;h=463ab6232702374a785d0c4047bfedd2ee108a5e;hb=dfe88a08c7f595801e733d3e6f4504d2695d5309;hp=3b2d9f631f22c82c8a5190b01886a9f6ce2c8a4e;hpb=0ae6dc9c21fa346ea6f904c43fb5c7165464ce59;p=oota-llvm.git diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 3b2d9f631f2..463ab623270 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -1,3 +1,273 @@ +// Group template arguments that can be derived from the vector type (EltNum x +// EltVT). These are things like the register class for the writemask, etc. +// The idea is to pass one of these as the template argument rather than the +// individual arguments. +// The template is also used for scalar types, in this case numelts is 1. +class X86VectorVTInfo { + RegisterClass RC = rc; + ValueType EltVT = eltvt; + int NumElts = numelts; + + // Corresponding mask register class. + RegisterClass KRC = !cast("VK" # NumElts); + + // Corresponding write-mask register class. + RegisterClass KRCWM = !cast("VK" # NumElts # "WM"); + + // The GPR register class that can hold the write mask. Use GR8 for fewer + // than 8 elements. Use shift-right and equal to work around the lack of + // !lt in tablegen. + RegisterClass MRC = + !cast("GR" # + !if (!eq (!srl(NumElts, 3), 0), 8, NumElts)); + + // Suffix used in the instruction mnemonic. + string Suffix = suffix; + + // VTName is a string name for vector VT. For vector types it will be + // v # NumElts # EltVT, so for vector of 8 elements of i32 it will be v8i32 + // It is a little bit complex for scalar types, where NumElts = 1. + // In this case we build v4f32 or v2f64 + string VTName = "v" # !if (!eq (NumElts, 1), + !if (!eq (EltVT.Size, 32), 4, + !if (!eq (EltVT.Size, 64), 2, NumElts)), NumElts) # EltVT; + + // The vector VT. 
+ ValueType VT = !cast(VTName); + + string EltTypeName = !cast(EltVT); + // Size of the element type in bits, e.g. 32 for v16i32. + string EltSizeName = !subst("i", "", !subst("f", "", EltTypeName)); + int EltSize = EltVT.Size; + + // "i" for integer types and "f" for floating-point types + string TypeVariantName = !subst(EltSizeName, "", EltTypeName); + + // Size of RC in bits, e.g. 512 for VR512. + int Size = VT.Size; + + // The corresponding memory operand, e.g. i512mem for VR512. + X86MemOperand MemOp = !cast(TypeVariantName # Size # "mem"); + X86MemOperand ScalarMemOp = !cast(EltVT # "mem"); + + // Load patterns + // Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64 + // due to load promotion during legalization + PatFrag LdFrag = !cast("load" # + !if (!eq (TypeVariantName, "i"), + !if (!eq (Size, 128), "v2i64", + !if (!eq (Size, 256), "v4i64", + VTName)), VTName)); + PatFrag ScalarLdFrag = !cast("load" # EltVT); + + // Load patterns used for memory operands. We only have this defined in + // case of i64 element types for sub-512 integer vectors. For now, keep + // MemOpFrag undefined in these cases. + PatFrag MemOpFrag = + !if (!eq (NumElts#EltTypeName, "1f32"), !cast("memopfsf32"), + !if (!eq (NumElts#EltTypeName, "1f64"), !cast("memopfsf64"), + !if (!eq (TypeVariantName, "f"), !cast("memop" # VTName), + !if (!eq (EltTypeName, "i64"), !cast("memop" # VTName), + !if (!eq (VTName, "v16i32"), !cast("memop" # VTName), ?))))); + + // The corresponding float type, e.g. v16f32 for v16i32 + // Note: For EltSize < 32, FloatVT is illegal and TableGen + // fails to compile, so we choose FloatVT = VT + ValueType FloatVT = !cast( + !if (!eq (!srl(EltSize,5),0), + VTName, + !if (!eq(TypeVariantName, "i"), + "v" # NumElts # "f" # EltSize, + VTName))); + + // The string to specify embedded broadcast in assembly. + string BroadcastStr = "{1to" # NumElts # "}"; + + // 8-bit compressed displacement tuple/subvector format. 
This is only + // defined for NumElts <= 8. + CD8VForm CD8TupleForm = !if (!eq (!srl(NumElts, 4), 0), + !cast("CD8VT" # NumElts), ?); + + SubRegIndex SubRegIdx = !if (!eq (Size, 128), sub_xmm, + !if (!eq (Size, 256), sub_ymm, ?)); + + Domain ExeDomain = !if (!eq (EltTypeName, "f32"), SSEPackedSingle, + !if (!eq (EltTypeName, "f64"), SSEPackedDouble, + SSEPackedInt)); + + // A vector type of the same width with element type i32. This is used to + // create the canonical constant zero node ImmAllZerosV. + ValueType i32VT = !cast("v" # !srl(Size, 5) # "i32"); + dag ImmAllZerosV = (VT (bitconvert (i32VT immAllZerosV))); +} + +def v64i8_info : X86VectorVTInfo<64, i8, VR512, "b">; +def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">; +def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">; +def v8i64_info : X86VectorVTInfo<8, i64, VR512, "q">; +def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">; +def v8f64_info : X86VectorVTInfo<8, f64, VR512, "pd">; + +// "x" in v32i8x_info means RC = VR256X +def v32i8x_info : X86VectorVTInfo<32, i8, VR256X, "b">; +def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">; +def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">; +def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">; +def v8f32x_info : X86VectorVTInfo<8, f32, VR256X, "ps">; +def v4f64x_info : X86VectorVTInfo<4, f64, VR256X, "pd">; + +def v16i8x_info : X86VectorVTInfo<16, i8, VR128X, "b">; +def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">; +def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">; +def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">; +def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">; +def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">; + +// We map scalar types to the smallest (128-bit) vector type +// with the appropriate element type. This allows to use the same masking logic. 
+def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">; +def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">; + +class AVX512VLVectorVTInfo { + X86VectorVTInfo info512 = i512; + X86VectorVTInfo info256 = i256; + X86VectorVTInfo info128 = i128; +} + +def avx512vl_i8_info : AVX512VLVectorVTInfo; +def avx512vl_i16_info : AVX512VLVectorVTInfo; +def avx512vl_i32_info : AVX512VLVectorVTInfo; +def avx512vl_i64_info : AVX512VLVectorVTInfo; +def avx512vl_f32_info : AVX512VLVectorVTInfo; +def avx512vl_f64_info : AVX512VLVectorVTInfo; + +// This multiclass generates the masking variants from the non-masking +// variant. It only provides the assembly pieces for the masking variants. +// It assumes custom ISel patterns for masking which can be provided as +// template arguments. +multiclass AVX512_maskable_custom O, Format F, + dag Outs, + dag Ins, dag MaskingIns, dag ZeroMaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + list Pattern, + list MaskingPattern, + list ZeroMaskingPattern, + string Round = "", + string MaskingConstraint = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> { + let isCommutable = IsCommutable in + def NAME: AVX512; + + // Prefer over VMOV*rrk Pat<> + let AddedComplexity = 20 in + def NAME#k: AVX512, + EVEX_K { + // In case of the 3src subclass this is overridden with a let. + string Constraints = MaskingConstraint; + } + let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<> + def NAME#kz: AVX512, + EVEX_KZ; +} + + +// Common base class of AVX512_maskable and AVX512_maskable_3src. 
+multiclass AVX512_maskable_common O, Format F, X86VectorVTInfo _, + dag Outs, + dag Ins, dag MaskingIns, dag ZeroMaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, dag MaskingRHS, + SDNode Select = vselect, string Round = "", + string MaskingConstraint = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> : + AVX512_maskable_custom; + +// This multiclass generates the unconditional/non-masking, the masking and +// the zero-masking variant of the vector instruction. In the masking case, the +// perserved vector elements come from a new dummy input operand tied to $dst. +multiclass AVX512_maskable O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, string Round = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> : + AVX512_maskable_common; + +// This multiclass generates the unconditional/non-masking, the masking and +// the zero-masking variant of the scalar instruction. +multiclass AVX512_maskable_scalar O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, string Round = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> : + AVX512_maskable_common; + +// Similar to AVX512_maskable but in this case one of the source operands +// ($src1) is already tied to $dst so we just use that for the preserved +// vector elements. NOTE that the NonTiedIns (the ins dag) should exclude +// $src1. +multiclass AVX512_maskable_3src O, Format F, X86VectorVTInfo _, + dag Outs, dag NonTiedIns, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS> : + AVX512_maskable_common; + + +multiclass AVX512_maskable_in_asm O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + list Pattern> : + AVX512_maskable_custom; + // Bitcasts between 512-bit vector types. 
Return the original type since // no instruction is needed for the conversion let Predicates = [HasAVX512] in { @@ -17,6 +287,7 @@ let Predicates = [HasAVX512] in { def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>; def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>; def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>; def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))), (v16i32 VR512:$src)>; def : Pat<(v16i32 (bitconvert (v64i8 VR512:$src))), (v16i32 VR512:$src)>; def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>; @@ -116,119 +387,92 @@ def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>; //===----------------------------------------------------------------------===// // AVX-512 - VECTOR INSERT // -// -- 32x8 form -- -let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { -def VINSERTF32x4rr : AVX512AIi8<0x18, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, VR128X:$src2, i8imm:$src3), - "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512; -let mayLoad = 1 in -def VINSERTF32x4rm : AVX512AIi8<0x18, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, f128mem:$src2, i8imm:$src3), - "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>; -} - -// -- 64x4 fp form -- -let hasSideEffects = 0, ExeDomain = SSEPackedDouble in { -def VINSERTF64x4rr : AVX512AIi8<0x1a, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, VR256X:$src2, i8imm:$src3), - "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, VEX_W; -let mayLoad = 1 in -def VINSERTF64x4rm : AVX512AIi8<0x1a, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, i256mem:$src2, i8imm:$src3), - "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, 
CD8VT4>; -} -// -- 32x4 integer form -- -let hasSideEffects = 0 in { -def VINSERTI32x4rr : AVX512AIi8<0x38, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, VR128X:$src2, i8imm:$src3), - "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512; -let mayLoad = 1 in -def VINSERTI32x4rm : AVX512AIi8<0x38, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, i128mem:$src2, i8imm:$src3), - "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>; + +multiclass vinsert_for_size_no_alt { + let hasSideEffects = 0, ExeDomain = To.ExeDomain in { + def rr : AVX512AIi8, + EVEX_4V, EVEX_V512; + + let mayLoad = 1 in + def rm : AVX512AIi8, + EVEX_4V, EVEX_V512, EVEX_CD8; + } } -let hasSideEffects = 0 in { -// -- 64x4 form -- -def VINSERTI64x4rr : AVX512AIi8<0x3a, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, VR256X:$src2, i8imm:$src3), - "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, VEX_W; -let mayLoad = 1 in -def VINSERTI64x4rm : AVX512AIi8<0x3a, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, i256mem:$src2, i8imm:$src3), - "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; -} - -def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (v4f32 VR128X:$src2), - (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8f64 VR512:$src1), (v2f64 VR128X:$src2), - (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (v2i64 VR128X:$src2), - (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v4i32 VR128X:$src2), - (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2, - 
(INSERT_get_vinsert128_imm VR512:$ins))>; - -def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (loadv4f32 addr:$src2), - (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), - (bc_v4i32 (loadv2i64 addr:$src2)), - (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8f64 VR512:$src1), (loadv2f64 addr:$src2), - (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (loadv2i64 addr:$src2), - (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; - -def : Pat<(vinsert256_insert:$ins (v16f32 VR512:$src1), (v8f32 VR256X:$src2), - (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert256_insert:$ins (v8f64 VR512:$src1), (v4f64 VR256X:$src2), - (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (v4i64 VR256X:$src2), - (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v8i32 VR256X:$src2), - (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; - -def : Pat<(vinsert256_insert:$ins (v16f32 VR512:$src1), (loadv8f32 addr:$src2), - (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert256_insert:$ins (v8f64 VR512:$src1), (loadv4f64 addr:$src2), - (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert256_insert:$ins (v8i64 VR512:$src1), (loadv4i64 addr:$src2), - (iPTR imm)), (VINSERTI64x4rm VR512:$src1, 
addr:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert256_insert:$ins (v16i32 VR512:$src1), - (bc_v8i32 (loadv4i64 addr:$src2)), - (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; +multiclass vinsert_for_size : + vinsert_for_size_no_alt { + // Codegen pattern with the alternative types, e.g. v2i64 -> v8i64 for + // vinserti32x4. Only add this if 64x2 and friends are not supported + // natively via AVX512DQ. + let Predicates = [NoDQI] in + def : Pat<(vinsert_insert:$ins + (AltTo.VT VR512:$src1), (AltFrom.VT From.RC:$src2), (iPTR imm)), + (AltTo.VT (!cast(NAME # From.EltSize # "x4rr") + VR512:$src1, From.RC:$src2, + (INSERT_get_vinsert_imm VR512:$ins)))>; +} + +multiclass vinsert_for_type { + defm NAME # "32x4" : vinsert_for_size, + X86VectorVTInfo<16, EltVT32, VR512>, + X86VectorVTInfo< 2, EltVT64, VR128X>, + X86VectorVTInfo< 8, EltVT64, VR512>, + vinsert128_insert, + INSERT_get_vinsert128_imm>; + let Predicates = [HasDQI] in + defm NAME # "64x2" : vinsert_for_size_no_alt, + X86VectorVTInfo< 8, EltVT64, VR512>, + vinsert128_insert, + INSERT_get_vinsert128_imm>, VEX_W; + defm NAME # "64x4" : vinsert_for_size, + X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 8, EltVT32, VR256>, + X86VectorVTInfo<16, EltVT32, VR512>, + vinsert256_insert, + INSERT_get_vinsert256_imm>, VEX_W; + let Predicates = [HasDQI] in + defm NAME # "32x8" : vinsert_for_size_no_alt, + X86VectorVTInfo<16, EltVT32, VR512>, + vinsert256_insert, + INSERT_get_vinsert256_imm>; +} + +defm VINSERTF : vinsert_for_type; +defm VINSERTI : vinsert_for_type; // vinsertps - insert f32 to XMM def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VR128X:$src2, u32u8imm:$src3), + (ins VR128X:$src1, VR128X:$src2, i8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>, EVEX_4V; def VINSERTPSzrm: 
AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), - (ins VR128X:$src1, f32mem:$src2, u32u8imm:$src3), + (ins VR128X:$src1, f32mem:$src2, i8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), @@ -237,106 +481,90 @@ def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), //===----------------------------------------------------------------------===// // AVX-512 VECTOR EXTRACT //--- -let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { -// -- 32x4 form -- -def VEXTRACTF32x4rr : AVX512AIi8<0x19, MRMDestReg, (outs VR128X:$dst), - (ins VR512:$src1, i8imm:$src2), - "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512; -def VEXTRACTF32x4mr : AVX512AIi8<0x19, MRMDestMem, (outs), - (ins f128mem:$dst, VR512:$src1, i8imm:$src2), - "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>; - -// -- 64x4 form -- -def VEXTRACTF64x4rr : AVX512AIi8<0x1b, MRMDestReg, (outs VR256X:$dst), - (ins VR512:$src1, i8imm:$src2), - "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, VEX_W; -let mayStore = 1 in -def VEXTRACTF64x4mr : AVX512AIi8<0x1b, MRMDestMem, (outs), - (ins f256mem:$dst, VR512:$src1, i8imm:$src2), - "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; -} -let hasSideEffects = 0 in { -// -- 32x4 form -- -def VEXTRACTI32x4rr : AVX512AIi8<0x39, MRMDestReg, (outs VR128X:$dst), - (ins VR512:$src1, i8imm:$src2), - "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512; -def VEXTRACTI32x4mr : AVX512AIi8<0x39, MRMDestMem, (outs), - (ins i128mem:$dst, VR512:$src1, i8imm:$src2), - "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>; - -// -- 64x4 form -- -def VEXTRACTI64x4rr : AVX512AIi8<0x3b, MRMDestReg, (outs 
VR256X:$dst), - (ins VR512:$src1, i8imm:$src2), - "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, VEX_W; -let mayStore = 1 in -def VEXTRACTI64x4mr : AVX512AIi8<0x3b, MRMDestMem, (outs), - (ins i256mem:$dst, VR512:$src1, i8imm:$src2), - "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; -} - -def : Pat<(vextract128_extract:$ext (v16f32 VR512:$src1), (iPTR imm)), - (v4f32 (VEXTRACTF32x4rr VR512:$src1, - (EXTRACT_get_vextract128_imm VR128X:$ext)))>; - -def : Pat<(vextract128_extract:$ext VR512:$src1, (iPTR imm)), - (v4i32 (VEXTRACTF32x4rr VR512:$src1, - (EXTRACT_get_vextract128_imm VR128X:$ext)))>; - -def : Pat<(vextract128_extract:$ext (v8f64 VR512:$src1), (iPTR imm)), - (v2f64 (VEXTRACTF32x4rr VR512:$src1, - (EXTRACT_get_vextract128_imm VR128X:$ext)))>; - -def : Pat<(vextract128_extract:$ext (v8i64 VR512:$src1), (iPTR imm)), - (v2i64 (VEXTRACTI32x4rr VR512:$src1, - (EXTRACT_get_vextract128_imm VR128X:$ext)))>; - - -def : Pat<(vextract256_extract:$ext (v16f32 VR512:$src1), (iPTR imm)), - (v8f32 (VEXTRACTF64x4rr VR512:$src1, - (EXTRACT_get_vextract256_imm VR256X:$ext)))>; - -def : Pat<(vextract256_extract:$ext (v16i32 VR512:$src1), (iPTR imm)), - (v8i32 (VEXTRACTI64x4rr VR512:$src1, - (EXTRACT_get_vextract256_imm VR256X:$ext)))>; - -def : Pat<(vextract256_extract:$ext (v8f64 VR512:$src1), (iPTR imm)), - (v4f64 (VEXTRACTF64x4rr VR512:$src1, - (EXTRACT_get_vextract256_imm VR256X:$ext)))>; - -def : Pat<(vextract256_extract:$ext (v8i64 VR512:$src1), (iPTR imm)), - (v4i64 (VEXTRACTI64x4rr VR512:$src1, - (EXTRACT_get_vextract256_imm VR256X:$ext)))>; - -// A 256-bit subvector extract from the first 512-bit vector position -// is a subregister copy that needs no instruction. 
-def : Pat<(v8i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))), - (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm))>; -def : Pat<(v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))), - (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm))>; -def : Pat<(v4i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))), - (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm))>; -def : Pat<(v4f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))), - (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm))>; - -// zmm -> xmm -def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))), - (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>; -def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))), - (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>; -def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))), - (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>; -def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))), - (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; +multiclass vextract_for_size { + let hasSideEffects = 0, ExeDomain = To.ExeDomain in { + defm rr : AVX512_maskable_in_asm, + AVX512AIi8Base, EVEX, EVEX_V512; + let mayStore = 1 in + def rm : AVX512AIi8, EVEX, EVEX_V512, EVEX_CD8; + } + // Codegen pattern with the alternative types, e.g. v8i64 -> v2i64 for + // vextracti32x4 + def : Pat<(vextract_extract:$ext (AltFrom.VT VR512:$src1), (iPTR imm)), + (AltTo.VT (!cast(NAME # To.EltSize # "x4rr") + VR512:$src1, + (EXTRACT_get_vextract_imm To.RC:$ext)))>; + + // A 128/256-bit subvector extract from the first 512-bit vector position is + // a subregister copy that needs no instruction. + def : Pat<(To.VT (extract_subvector (From.VT VR512:$src), (iPTR 0))), + (To.VT + (EXTRACT_SUBREG (From.VT VR512:$src), To.SubRegIdx))>; + + // And for the alternative types. 
+ def : Pat<(AltTo.VT (extract_subvector (AltFrom.VT VR512:$src), (iPTR 0))), + (AltTo.VT + (EXTRACT_SUBREG (AltFrom.VT VR512:$src), AltTo.SubRegIdx))>; + + // Intrinsic call with masking. + def : Pat<(!cast("int_x86_avx512_mask_vextract" # To.EltTypeName # + "x4_512") + VR512:$src1, (iPTR imm:$idx), To.RC:$src0, GR8:$mask), + (!cast(NAME # To.EltSize # "x4rrk") To.RC:$src0, + (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)), + VR512:$src1, imm:$idx)>; + + // Intrinsic call with zero-masking. + def : Pat<(!cast("int_x86_avx512_mask_vextract" # To.EltTypeName # + "x4_512") + VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, GR8:$mask), + (!cast(NAME # To.EltSize # "x4rrkz") + (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)), + VR512:$src1, imm:$idx)>; + + // Intrinsic call without masking. + def : Pat<(!cast("int_x86_avx512_mask_vextract" # To.EltTypeName # + "x4_512") + VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)), + (!cast(NAME # To.EltSize # "x4rr") + VR512:$src1, imm:$idx)>; +} + +multiclass vextract_for_type { + defm NAME # "32x4" : vextract_for_size, + X86VectorVTInfo< 4, EltVT32, VR128X>, + X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 2, EltVT64, VR128X>, + vextract128_extract, + EXTRACT_get_vextract128_imm>; + defm NAME # "64x4" : vextract_for_size, + X86VectorVTInfo< 4, EltVT64, VR256X>, + X86VectorVTInfo<16, EltVT32, VR512>, + X86VectorVTInfo< 8, EltVT32, VR256>, + vextract256_extract, + EXTRACT_get_vextract256_imm>, VEX_W; +} + +defm VEXTRACTF : vextract_for_type; +defm VEXTRACTI : vextract_for_type; // A 128-bit subvector insert to the first 512-bit vector position // is a subregister copy that needs no instruction. 
@@ -368,13 +596,13 @@ def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)), // vextractps - extract 32 bits from XMM def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst), - (ins VR128X:$src1, u32u8imm:$src2), + (ins VR128X:$src1, i32i8imm:$src2), "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>, EVEX; def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs), - (ins f32mem:$dst, VR128X:$src1, u32u8imm:$src2), + (ins f32mem:$dst, VR128X:$src1, i32i8imm:$src2), "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2), addr:$dst)]>, EVEX, EVEX_CD8<32, CD8VT1>; @@ -382,105 +610,175 @@ def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs), //===---------------------------------------------------------------------===// // AVX-512 BROADCAST //--- -multiclass avx512_fp_broadcast opc, string OpcodeStr, - RegisterClass DestRC, - RegisterClass SrcRC, X86MemOperand x86memop> { - def rr : AVX5128I, EVEX; - def rm : AVX5128I, EVEX; +multiclass avx512_fp_broadcast opc, SDNode OpNode, RegisterClass SrcRC, + ValueType svt, X86VectorVTInfo _> { + defm r : AVX512_maskable, + T8PD, EVEX; + + let mayLoad = 1 in { + defm m : AVX512_maskable, + T8PD, EVEX; + } } + +multiclass avx512_fp_broadcast_vl opc, SDNode OpNode, + AVX512VLVectorVTInfo _> { + defm Z : avx512_fp_broadcast, + EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : avx512_fp_broadcast, + EVEX_V256; + } +} + let ExeDomain = SSEPackedSingle in { - defm VBROADCASTSSZ : avx512_fp_broadcast<0x18, "vbroadcastss", VR512, - VR128X, f32mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; + defm VBROADCASTSS : avx512_fp_broadcast_vl<0x18, X86VBroadcast, + avx512vl_f32_info>, EVEX_CD8<32, CD8VT1>; + let Predicates = [HasVLX] in { + defm VBROADCASTSSZ128 : avx512_fp_broadcast<0x18, X86VBroadcast, VR128X, + v4f32, v4f32x_info>, EVEX_V128, + EVEX_CD8<32, CD8VT1>; + } } let 
ExeDomain = SSEPackedDouble in { - defm VBROADCASTSDZ : avx512_fp_broadcast<0x19, "vbroadcastsd", VR512, - VR128X, f64mem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + defm VBROADCASTSD : avx512_fp_broadcast_vl<0x19, X86VBroadcast, + avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VT1>; +} + +// avx512_broadcast_pat introduces patterns for broadcast with a scalar argument. +// Later, we can canonize broadcast instructions before ISel phase and +// eliminate additional patterns on ISel. +// SrcRC_v and SrcRC_s are RegisterClasses for vector and scalar +// representations of source +multiclass avx512_broadcast_pat { + def : Pat<(_.VT (OpNode (_.EltVT SrcRC_s:$src))), + (!cast(InstName##"r") + (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>; + + let AddedComplexity = 30 in { + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (_.EltVT SrcRC_s:$src)), _.RC:$src0)), + (!cast(InstName##"rk") _.RC:$src0, _.KRCWM:$mask, + (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>; + + def : Pat<(_.VT(vselect _.KRCWM:$mask, + (OpNode (_.EltVT SrcRC_s:$src)), _.ImmAllZerosV)), + (!cast(InstName##"rkz") _.KRCWM:$mask, + (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>; + } +} + +defm : avx512_broadcast_pat<"VBROADCASTSSZ", X86VBroadcast, v16f32_info, + VR128X, FR32X>; +defm : avx512_broadcast_pat<"VBROADCASTSDZ", X86VBroadcast, v8f64_info, + VR128X, FR64X>; + +let Predicates = [HasVLX] in { + defm : avx512_broadcast_pat<"VBROADCASTSSZ256", X86VBroadcast, + v8f32x_info, VR128X, FR32X>; + defm : avx512_broadcast_pat<"VBROADCASTSSZ128", X86VBroadcast, + v4f32x_info, VR128X, FR32X>; + defm : avx512_broadcast_pat<"VBROADCASTSDZ256", X86VBroadcast, + v4f64x_info, VR128X, FR64X>; } def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSSZrm addr:$src)>; + (VBROADCASTSSZm addr:$src)>; def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))), - (VBROADCASTSDZrm addr:$src)>; + (VBROADCASTSDZm addr:$src)>; def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src), - (VBROADCASTSSZrm addr:$src)>; + 
(VBROADCASTSSZm addr:$src)>; def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src), - (VBROADCASTSDZrm addr:$src)>; - -multiclass avx512_int_broadcast_reg opc, string OpcodeStr, - RegisterClass SrcRC, RegisterClass KRC> { - def Zrr : AVX5128I, EVEX, EVEX_V512; - def Zkrr : AVX5128I, EVEX, EVEX_V512, EVEX_KZ; -} - -defm VPBROADCASTDr : avx512_int_broadcast_reg<0x7C, "vpbroadcastd", GR32, VK16WM>; -defm VPBROADCASTQr : avx512_int_broadcast_reg<0x7C, "vpbroadcastq", GR64, VK8WM>, - VEX_W; - + (VBROADCASTSDZm addr:$src)>; + +multiclass avx512_int_broadcast_reg opc, X86VectorVTInfo _, + RegisterClass SrcRC> { + defm r : AVX512_maskable_in_asm, T8PD, EVEX; +} + +multiclass avx512_int_broadcast_reg_vl opc, AVX512VLVectorVTInfo _, + RegisterClass SrcRC, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_int_broadcast_reg, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_int_broadcast_reg, EVEX_V256; + defm Z128 : avx512_int_broadcast_reg, EVEX_V128; + } +} + +defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info, GR32, + HasBWI>; +defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info, GR32, + HasBWI>; +defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, GR32, + HasAVX512>; +defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, GR64, + HasAVX512>, VEX_W; + def : Pat <(v16i32 (X86vzext VK16WM:$mask)), - (VPBROADCASTDrZkrr VK16WM:$mask, (i32 (MOV32ri 0x1)))>; + (VPBROADCASTDrZrkz VK16WM:$mask, (i32 (MOV32ri 0x1)))>; def : Pat <(v8i64 (X86vzext VK8WM:$mask)), - (VPBROADCASTQrZkrr VK8WM:$mask, (i64 (MOV64ri 0x1)))>; + (VPBROADCASTQrZrkz VK8WM:$mask, (i64 (MOV64ri 0x1)))>; def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))), - (VPBROADCASTDrZrr GR32:$src)>; + (VPBROADCASTDrZr GR32:$src)>; def : Pat<(v16i32 (X86VBroadcastm VK16WM:$mask, (i32 GR32:$src))), - (VPBROADCASTDrZkrr VK16WM:$mask, GR32:$src)>; + (VPBROADCASTDrZrkz VK16WM:$mask, GR32:$src)>; def : Pat<(v8i64 
(X86VBroadcast (i64 GR64:$src))), - (VPBROADCASTQrZrr GR64:$src)>; + (VPBROADCASTQrZr GR64:$src)>; def : Pat<(v8i64 (X86VBroadcastm VK8WM:$mask, (i64 GR64:$src))), - (VPBROADCASTQrZkrr VK8WM:$mask, GR64:$src)>; + (VPBROADCASTQrZrkz VK8WM:$mask, GR64:$src)>; def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_i32_512 (i32 GR32:$src))), - (VPBROADCASTDrZrr GR32:$src)>; + (VPBROADCASTDrZr GR32:$src)>; def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_i64_512 (i64 GR64:$src))), - (VPBROADCASTQrZrr GR64:$src)>; + (VPBROADCASTQrZr GR64:$src)>; def : Pat<(v16i32 (int_x86_avx512_mask_pbroadcast_d_gpr_512 (i32 GR32:$src), (v16i32 immAllZerosV), (i16 GR16:$mask))), - (VPBROADCASTDrZkrr (COPY_TO_REGCLASS GR16:$mask, VK16WM), GR32:$src)>; + (VPBROADCASTDrZrkz (COPY_TO_REGCLASS GR16:$mask, VK16WM), GR32:$src)>; def : Pat<(v8i64 (int_x86_avx512_mask_pbroadcast_q_gpr_512 (i64 GR64:$src), (bc_v8i64 (v16i32 immAllZerosV)), (i8 GR8:$mask))), - (VPBROADCASTQrZkrr (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>; + (VPBROADCASTQrZrkz (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>; multiclass avx512_int_broadcast_rm opc, string OpcodeStr, X86MemOperand x86memop, PatFrag ld_frag, RegisterClass DstRC, ValueType OpVT, ValueType SrcVT, RegisterClass KRC> { def rr : AVX5128I, EVEX; def krr : AVX5128I, EVEX, EVEX_KZ; let mayLoad = 1 in { def rm : AVX5128I, EVEX; def krm : AVX5128I, EVEX, EVEX_KZ; } } @@ -497,12 +795,12 @@ multiclass avx512_int_subvec_broadcast_rm opc, string OpcodeStr, RegisterClass KRC> { let mayLoad = 1 in { def rm : AVX5128I, EVEX; def krm : AVX5128I, EVEX, EVEX_KZ; } } @@ -519,27 +817,32 @@ def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))), def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))), (VPBROADCASTQZrr VR128X:$src)>; -def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))), - (VBROADCASTSSZrr VR128X:$src)>; -def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))), - (VBROADCASTSDZrr VR128X:$src)>; +def : Pat<(v16f32 (X86VBroadcast 
(v16f32 VR512:$src))), + (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; +def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))), + (VBROADCASTSDZr (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>; + +def : Pat<(v16i32 (X86VBroadcast (v16i32 VR512:$src))), + (VPBROADCASTDZrr (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>; +def : Pat<(v8i64 (X86VBroadcast (v8i64 VR512:$src))), + (VPBROADCASTQZrr (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>; def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))), - (VBROADCASTSSZrr VR128X:$src)>; + (VBROADCASTSSZr VR128X:$src)>; def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))), - (VBROADCASTSDZrr VR128X:$src)>; - + (VBROADCASTSDZr VR128X:$src)>; + // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. def : Pat<(v16f32 (X86VBroadcast FR32X:$src)), - (VBROADCASTSSZrr (COPY_TO_REGCLASS FR32X:$src, VR128X))>; + (VBROADCASTSSZr (COPY_TO_REGCLASS FR32X:$src, VR128X))>; def : Pat<(v8f64 (X86VBroadcast FR64X:$src)), - (VBROADCASTSDZrr (COPY_TO_REGCLASS FR64X:$src, VR128X))>; + (VBROADCASTSDZr (COPY_TO_REGCLASS FR64X:$src, VR128X))>; let Predicates = [HasAVX512] in { def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))), - (EXTRACT_SUBREG + (EXTRACT_SUBREG (v16i32 (VPBROADCASTDZkrm (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), addr:$src)), sub_ymm)>; } @@ -548,64 +851,107 @@ def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))), //--- multiclass avx512_mask_broadcast opc, string OpcodeStr, - RegisterClass DstRC, RegisterClass KRC, - ValueType OpVT, ValueType SrcVT> { -def rr : AVX512XS8I, EVEX; + RegisterClass KRC> { +let Predicates = [HasCDI] in +def Zrr : AVX512XS8I, EVEX, EVEX_V512; + +let Predicates = [HasCDI, HasVLX] in { +def Z128rr : AVX512XS8I, EVEX, EVEX_V128; +def Z256rr : AVX512XS8I, EVEX, EVEX_V256; +} } let Predicates = [HasCDI] in { -defm 
VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512, - VK16, v16i32, v16i1>, EVEX_V512; -defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512, - VK8, v8i64, v8i1>, EVEX_V512, VEX_W; +defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", + VK16>; +defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", + VK8>, VEX_W; } //===----------------------------------------------------------------------===// // AVX-512 - VPERM // // -- immediate form -- -multiclass avx512_perm_imm opc, string OpcodeStr, RegisterClass RC, - SDNode OpNode, PatFrag mem_frag, - X86MemOperand x86memop, ValueType OpVT> { - def ri : AVX512AIi8 opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + def ri : AVX512AIi8, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, + (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>, EVEX; - def mi : AVX512AIi8, EVEX; + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, + (_.VT (OpNode (_.MemOpFrag addr:$src1), + (i8 imm:$src2))))]>, + EVEX, EVEX_CD8<_.EltSize, CD8VF>; +} +} + +multiclass avx512_permil OpcImm, bits<8> OpcVar, X86VectorVTInfo _, + X86VectorVTInfo Ctrl> : + avx512_perm_imm { + let ExeDomain = _.ExeDomain in { + def rr : AVX5128I, + EVEX_4V; + def rm : AVX5128I, + EVEX_4V; + } } -defm VPERMQZ : avx512_perm_imm<0x00, "vpermq", VR512, X86VPermi, memopv8i64, - i512mem, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -let ExeDomain = SSEPackedDouble in -defm VPERMPDZ : avx512_perm_imm<0x01, "vpermpd", VR512, X86VPermi, memopv8f64, - f512mem, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMQZ : avx512_perm_imm<0x00, "vpermq", X86VPermi, v8i64_info>, + EVEX_V512, VEX_W; +defm VPERMPDZ : avx512_perm_imm<0x01, "vpermpd", X86VPermi, v8f64_info>, + EVEX_V512, VEX_W; + +defm VPERMILPSZ : avx512_permil<0x04, 0x0C, v16f32_info, v16i32_info>, + EVEX_V512; +defm VPERMILPDZ : avx512_permil<0x05, 0x0D, v8f64_info, 
v8i64_info>, + EVEX_V512, VEX_W; + +def : Pat<(v16i32 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), + (VPERMILPSZri VR512:$src1, imm:$imm)>; +def : Pat<(v8i64 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), + (VPERMILPDZri VR512:$src1, imm:$imm)>; // -- VPERM - register form -- -multiclass avx512_perm opc, string OpcodeStr, RegisterClass RC, +multiclass avx512_perm opc, string OpcodeStr, RegisterClass RC, PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT> { def rr : AVX5128I, EVEX_4V; def rm : AVX5128I, EVEX_4V; @@ -613,13 +959,13 @@ multiclass avx512_perm opc, string OpcodeStr, RegisterClass RC, defm VPERMDZ : avx512_perm<0x36, "vpermd", VR512, memopv16i32, i512mem, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMQZ : avx512_perm<0x36, "vpermq", VR512, memopv8i64, i512mem, +defm VPERMQZ : avx512_perm<0x36, "vpermq", VR512, memopv8i64, i512mem, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; let ExeDomain = SSEPackedSingle in defm VPERMPSZ : avx512_perm<0x16, "vpermps", VR512, memopv16f32, f512mem, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; let ExeDomain = SSEPackedDouble in -defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem, +defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; // -- VPERM2I - 3 source operands form -- @@ -630,7 +976,7 @@ let Constraints = "$src1 = $dst" in { def rr : AVX5128I, EVEX_4V; @@ -638,7 +984,7 @@ let Constraints = "$src1 = $dst" in { def rrk : AVX5128I, EVEX_4V; @@ -670,7 +1016,7 @@ let Constraints = "$src1 = $dst" in { def rmk : AVX5128I opc, string OpcodeStr, - RegisterClass KRC, RegisterClass RC, - X86MemOperand x86memop, PatFrag mem_frag, - SDNode OpNode, ValueType vt> { - def rr : AVX5128I opc, string OpcodeStr, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + def rr : AVX5128I, EVEX_4V, EVEX_K; - let mayLoad = 1 in - def rm : AVX5128I, EVEX_4V; + def rrk : AVX5128I, EVEX_4V, EVEX_K; + "\t{$src2, $src1, ${dst} {${mask}}|${dst} 
{${mask}}, $src1, $src2}"), + [(set _.RC:$dst, (X86select _.KRCWM:$mask, (_.VT _.RC:$src1), + (_.VT _.RC:$src2)))]>, EVEX_4V, EVEX_K; + def rrkz : AVX5128I, EVEX_4V, EVEX_KZ; + let mayLoad = 1 in { + def rm : AVX5128I, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + def rmk : AVX5128I, + EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>; + def rmkz : AVX5128I, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>; + } + } } +multiclass avx512_blendmask_rmb opc, string OpcodeStr, X86VectorVTInfo _> { + + def rmbk : AVX5128I, + EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + + def rmb : AVX5128I, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + +} + +multiclass blendmask_dq opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : avx512_blendmask , + avx512_blendmask_rmb , EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : avx512_blendmask, + avx512_blendmask_rmb , EVEX_V256; + defm Z128 : avx512_blendmask, + avx512_blendmask_rmb , EVEX_V128; + } +} + +multiclass blendmask_bw opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + let Predicates = [HasBWI] in + defm Z : avx512_blendmask , EVEX_V512; + + let Predicates = [HasBWI, HasVLX] in { + defm Z256 : avx512_blendmask , EVEX_V256; + defm Z128 : avx512_blendmask , EVEX_V128; + } +} + + +defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", avx512vl_f32_info>; +defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", avx512vl_f64_info>, VEX_W; +defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", avx512vl_i32_info>; +defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", avx512vl_i64_info>, VEX_W; +defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>; +defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W; -let ExeDomain = SSEPackedSingle in -defm VBLENDMPSZ : avx512_blendmask<0x65, "vblendmps", - VK16WM, VR512, f512mem, - memopv16f32, vselect, v16f32>, - EVEX_CD8<32, CD8VF>, EVEX_V512; -let ExeDomain = SSEPackedDouble in -defm VBLENDMPDZ : avx512_blendmask<0x65, "vblendmpd", - VK8WM, VR512, 
f512mem, - memopv8f64, vselect, v8f64>, - VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512; - -def : Pat<(v16f32 (int_x86_avx512_mask_blend_ps_512 (v16f32 VR512:$src1), - (v16f32 VR512:$src2), (i16 GR16:$mask))), - (VBLENDMPSZrr (COPY_TO_REGCLASS GR16:$mask, VK16WM), - VR512:$src1, VR512:$src2)>; - -def : Pat<(v8f64 (int_x86_avx512_mask_blend_pd_512 (v8f64 VR512:$src1), - (v8f64 VR512:$src2), (i8 GR8:$mask))), - (VBLENDMPDZrr (COPY_TO_REGCLASS GR8:$mask, VK8WM), - VR512:$src1, VR512:$src2)>; - -defm VPBLENDMDZ : avx512_blendmask<0x64, "vpblendmd", - VK16WM, VR512, f512mem, - memopv16i32, vselect, v16i32>, - EVEX_CD8<32, CD8VF>, EVEX_V512; - -defm VPBLENDMQZ : avx512_blendmask<0x64, "vpblendmq", - VK8WM, VR512, f512mem, - memopv8i64, vselect, v8i64>, - VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512; - -def : Pat<(v16i32 (int_x86_avx512_mask_blend_d_512 (v16i32 VR512:$src1), - (v16i32 VR512:$src2), (i16 GR16:$mask))), - (VPBLENDMDZrr (COPY_TO_REGCLASS GR16:$mask, VK16), - VR512:$src1, VR512:$src2)>; - -def : Pat<(v8i64 (int_x86_avx512_mask_blend_q_512 (v8i64 VR512:$src1), - (v8i64 VR512:$src2), (i8 GR8:$mask))), - (VPBLENDMQZrr (COPY_TO_REGCLASS GR8:$mask, VK8), - VR512:$src1, VR512:$src2)>; let Predicates = [HasAVX512] in { def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1), (v8f32 VR256X:$src2))), - (EXTRACT_SUBREG - (v16f32 (VBLENDMPSZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (EXTRACT_SUBREG + (v16f32 (VBLENDMPSZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), - (EXTRACT_SUBREG - (v16i32 (VPBLENDMDZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (EXTRACT_SUBREG + (v16i32 (VPBLENDMDZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; } @@ 
-850,98 +1229,295 @@ defm VCMPSDZ : avx512_cmp_scalar opc, string OpcodeStr, RegisterClass KRC, - RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, - SDNode OpNode, ValueType vt> { +multiclass avx512_icmp_packed opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { def rr : AVX512BI, EVEX_4V; + let mayLoad = 1 in def rm : AVX512BI, EVEX_4V; + def rrk : AVX512BI, EVEX_4V, EVEX_K; + let mayLoad = 1 in + def rmk : AVX512BI, EVEX_4V, EVEX_K; +} + +multiclass avx512_icmp_packed_rmb opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> : + avx512_icmp_packed { + let mayLoad = 1 in { + def rmb : AVX512BI, EVEX_4V, EVEX_B; + def rmbk : AVX512BI, EVEX_4V, EVEX_K, EVEX_B; + } } -defm VPCMPEQDZ : avx512_icmp_packed<0x76, "vpcmpeqd", VK16, VR512, i512mem, - memopv16i32, X86pcmpeqm, v16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPCMPEQQZ : avx512_icmp_packed<0x29, "vpcmpeqq", VK8, VR512, i512mem, - memopv8i64, X86pcmpeqm, v8i64>, T8PD, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; +multiclass avx512_icmp_packed_vl opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_packed, + EVEX_V512; -defm VPCMPGTDZ : avx512_icmp_packed<0x66, "vpcmpgtd", VK16, VR512, i512mem, - memopv16i32, X86pcmpgtm, v16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPCMPGTQZ : avx512_icmp_packed<0x37, "vpcmpgtq", VK8, VR512, i512mem, - memopv8i64, X86pcmpgtm, v8i64>, T8PD, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_packed, + EVEX_V256; + defm Z128 : avx512_icmp_packed, + EVEX_V128; + } +} + +multiclass avx512_icmp_packed_rmb_vl opc, string OpcodeStr, + SDNode OpNode, AVX512VLVectorVTInfo VTInfo, + Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_packed_rmb, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_packed_rmb, + EVEX_V256; + defm Z128 : avx512_icmp_packed_rmb, + EVEX_V128; + } +} + 
+defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm, + avx512vl_i8_info, HasBWI>, + EVEX_CD8<8, CD8VF>; + +defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm, + avx512vl_i16_info, HasBWI>, + EVEX_CD8<16, CD8VF>; + +defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm, + avx512vl_i32_info, HasAVX512>, + EVEX_CD8<32, CD8VF>; + +defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm, + avx512vl_i64_info, HasAVX512>, + T8PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, + avx512vl_i8_info, HasBWI>, + EVEX_CD8<8, CD8VF>; + +defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, + avx512vl_i16_info, HasBWI>, + EVEX_CD8<16, CD8VF>; + +defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, + avx512vl_i32_info, HasAVX512>, + EVEX_CD8<32, CD8VF>; + +defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, + avx512vl_i64_info, HasAVX512>, + T8PD, VEX_W, EVEX_CD8<64, CD8VF>; def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), - (COPY_TO_REGCLASS (VPCMPGTDZrr + (COPY_TO_REGCLASS (VPCMPGTDZrr (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>; def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), - (COPY_TO_REGCLASS (VPCMPEQDZrr + (COPY_TO_REGCLASS (VPCMPEQDZrr (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>; -multiclass avx512_icmp_cc opc, RegisterClass WMRC, RegisterClass KRC, - RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, - SDNode OpNode, ValueType vt, Operand CC, string Suffix> { +multiclass avx512_icmp_cc opc, string Suffix, SDNode OpNode, + X86VectorVTInfo _> { def rri : AVX512AIi8, EVEX_4V; + let mayLoad = 1 in def rmi : AVX512AIi8, EVEX_4V; + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert 
(_.LdFrag addr:$src2))), + imm:$cc))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V; + def rrik : AVX512AIi8, EVEX_4V, EVEX_K; + let mayLoad = 1 in + def rmik : AVX512AIi8, EVEX_4V, EVEX_K; + // Accept explicit immediate argument form instead of comparison code. let isAsmParserOnly = 1, hasSideEffects = 0 in { def rri_alt : AVX512AIi8, EVEX_4V; + def rmi_alt : AVX512AIi8, EVEX_4V; def rrik_alt : AVX512AIi8, EVEX_4V, EVEX_K; - def rmi_alt : AVX512AIi8, EVEX_4V; def rmik_alt : AVX512AIi8, EVEX_4V, EVEX_K; } } -defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16WM, VK16, VR512, i512mem, memopv16i32, - X86cmpm, v16i32, AVXCC, "d">, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16WM, VK16, VR512, i512mem, memopv16i32, - X86cmpmu, v16i32, AVXCC, "ud">, - EVEX_V512, EVEX_CD8<32, CD8VF>; +multiclass avx512_icmp_cc_rmb opc, string Suffix, SDNode OpNode, + X86VectorVTInfo _> : + avx512_icmp_cc { + let mayLoad = 1 in { + def rmib : AVX512AIi8, EVEX_4V, EVEX_B; + def rmibk : AVX512AIi8, EVEX_4V, EVEX_K, EVEX_B; + } + + // Accept explicit immediate argument form instead of comparison code. 
+ let isAsmParserOnly = 1, hasSideEffects = 0 in { + def rmib_alt : AVX512AIi8, EVEX_4V, EVEX_B; + def rmibk_alt : AVX512AIi8, EVEX_4V, EVEX_K, EVEX_B; + } +} + +multiclass avx512_icmp_cc_vl opc, string Suffix, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_cc, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_cc, EVEX_V256; + defm Z128 : avx512_icmp_cc, EVEX_V128; + } +} + +multiclass avx512_icmp_cc_rmb_vl opc, string Suffix, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_cc_rmb, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_cc_rmb, + EVEX_V256; + defm Z128 : avx512_icmp_cc_rmb, + EVEX_V128; + } +} + +defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, avx512vl_i8_info, + HasBWI>, EVEX_CD8<8, CD8VF>; +defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, avx512vl_i8_info, + HasBWI>, EVEX_CD8<8, CD8VF>; + +defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, avx512vl_i16_info, + HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; +defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, avx512vl_i16_info, + HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; + +defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, avx512vl_i32_info, + HasAVX512>, EVEX_CD8<32, CD8VF>; +defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, avx512vl_i32_info, + HasAVX512>, EVEX_CD8<32, CD8VF>; -defm VPCMPQZ : avx512_icmp_cc<0x1F, VK8WM, VK8, VR512, i512mem, memopv8i64, - X86cmpm, v8i64, AVXCC, "q">, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; -defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8WM, VK8, VR512, i512mem, memopv8i64, - X86cmpmu, v8i64, AVXCC, "uq">, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; +defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info, + HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info, + HasAVX512>, VEX_W, EVEX_CD8<64, 
CD8VF>; // avx512_cmp_packed - compare packed instructions multiclass avx512_cmp_packed; def rrib: AVX512PIi8<0xC2, MRMSrcReg, (outs KRC:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc), !strconcat("vcmp${cc}", suffix, - " \t{{sae}, $src2, $src1, $dst|$dst, $src1, $src2, {sae}}"), + "\t{{sae}, $src2, $src1, $dst|$dst, $src1, $src2, {sae}}"), [], d>, EVEX_B; def rmi : AVX512PIi8<0xC2, MRMSrcMem, (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, AVXCC:$cc), !strconcat("vcmp${cc}", suffix, - " \t{$src2, $src1, $dst|$dst, $src1, $src2, $cc}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [(set KRC:$dst, (X86cmpm (vt RC:$src1), (memop addr:$src2), imm:$cc))], d>; @@ -969,11 +1545,11 @@ multiclass avx512_cmp_packed; + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; def rmi_alt : AVX512PIi8<0xC2, MRMSrcMem, (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), !strconcat("vcmp", suffix, - " \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; } } @@ -1001,25 +1577,25 @@ def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), imm:$cc), VK8)>; def : Pat<(i16 (int_x86_avx512_mask_cmp_ps_512 (v16f32 VR512:$src1), - (v16f32 VR512:$src2), imm:$cc, (i16 -1), + (v16f32 VR512:$src2), i32immZExt5:$cc, (i16 -1), FROUND_NO_EXC)), (COPY_TO_REGCLASS (VCMPPSZrrib VR512:$src1, VR512:$src2, (I8Imm imm:$cc)), GR16)>; - + def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1), - (v8f64 VR512:$src2), imm:$cc, (i8 -1), + (v8f64 VR512:$src2), i32immZExt5:$cc, (i8 -1), FROUND_NO_EXC)), (COPY_TO_REGCLASS (VCMPPDZrrib VR512:$src1, VR512:$src2, (I8Imm imm:$cc)), GR8)>; def : Pat<(i16 (int_x86_avx512_mask_cmp_ps_512 (v16f32 VR512:$src1), - (v16f32 VR512:$src2), imm:$cc, (i16 -1), + (v16f32 VR512:$src2), i32immZExt5:$cc, (i16 -1), FROUND_CURRENT)), (COPY_TO_REGCLASS (VCMPPSZrri VR512:$src1, VR512:$src2, (I8Imm imm:$cc)), GR16)>; def : Pat<(i8 
(int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1), - (v8f64 VR512:$src2), imm:$cc, (i8 -1), + (v8f64 VR512:$src2), i32immZExt5:$cc, (i8 -1), FROUND_CURRENT)), (COPY_TO_REGCLASS (VCMPPDZrri VR512:$src1, VR512:$src2, (I8Imm imm:$cc)), GR8)>; @@ -1034,14 +1610,14 @@ multiclass avx512_mask_mov opc_kk, bits<8> opc_km, bits<8> opc_mk, ValueType vvt, ValueType ivt, X86MemOperand x86memop> { let hasSideEffects = 0 in { def kk : I; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; let mayLoad = 1 in def km : I; let mayStore = 1 in def mk : I; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; } } @@ -1050,9 +1626,9 @@ multiclass avx512_mask_mov_gpr opc_kr, bits<8> opc_rk, RegisterClass KRC, RegisterClass GRC> { let hasSideEffects = 0 in { def kr : I; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; def rk : I; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; } } @@ -1129,6 +1705,10 @@ let Predicates = [HasBWI] in { } let Predicates = [HasAVX512] in { + def : Pat<(i1 (trunc (i64 GR64:$src))), + (COPY_TO_REGCLASS (KMOVWkr (AND32ri (EXTRACT_SUBREG $src, sub_32bit), + (i32 1))), VK1)>; + def : Pat<(i1 (trunc (i32 GR32:$src))), (COPY_TO_REGCLASS (KMOVWkr (AND32ri $src, (i32 1))), VK1)>; @@ -1198,7 +1778,7 @@ multiclass avx512_mask_unop opc, string OpcodeStr, Predicate prd> { let Predicates = [prd] in def rr : I; } @@ -1252,7 +1832,7 @@ multiclass avx512_mask_binop opc, string OpcodeStr, let Predicates = [prd] in def rr : I; } @@ -1328,7 +1908,7 @@ multiclass avx512_mask_unpck opc, string OpcodeStr, let Predicates = [HasAVX512] in def rr : I; + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; } multiclass avx512_mask_unpck_bw opc, string OpcodeStr> { @@ -1357,7 +1937,7 @@ multiclass avx512_mask_testop opc, string OpcodeStr, RegisterClass KRC, SDNode OpNode> { let Predicates = [HasAVX512], Defs = [EFLAGS] in def rr : I; } @@ -1378,7 +1958,7 @@ multiclass avx512_mask_shiftop opc, string OpcodeStr, RegisterClass KRC, let Predicates = 
[HasAVX512] in def ri : Ii8; } @@ -1424,6 +2004,17 @@ def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))), def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; +let Predicates = [HasVLX] in { + def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>; + def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>; + def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>; + def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>; +} + def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))), (v8i1 (COPY_TO_REGCLASS (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>; @@ -1522,7 +2113,7 @@ multiclass avx512_load_vl opc, string OpcodeStr, string ld_pat, multiclass avx512_store opc, string OpcodeStr, PatFrag st_frag, ValueType OpVT, RegisterClass KRC, RegisterClass RC, X86MemOperand memop, Domain d> { - let isAsmParserOnly = 1, hasSideEffects = 0 in { + let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def rr_alt : AVX512PI, EVEX; @@ -1618,6 +2209,46 @@ def: Pat<(int_x86_avx512_mask_storeu_pd_512 addr:$ptr, (v8f64 VR512:$src), (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src)>; +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)), + (VMOVUPSZmrk addr:$ptr, + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; + +def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; + +def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src)), + (VMOVUPSZmrk addr:$ptr, VK16WM:$mask, VR512:$src)>; + +def: 
Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src)), + (VMOVUPDZmrk addr:$ptr, VK8WM:$mask, VR512:$src)>; + +def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, undef)), + (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, + (bc_v16f32 (v16i32 immAllZerosV)))), + (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src0))), + (VMOVUPSZrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, + (bc_v8f64 (v16i32 immAllZerosV)))), + (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src0))), + (VMOVUPDZrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src0))), + (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmk + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src0, sub_ymm), + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; + defm VMOVDQA32 : avx512_load_vl<0x6F, "vmovdqa32", "alignedload", "i", "32", "16", "8", "4", SSEPackedInt, HasAVX512>, avx512_store_vl<0x7F, "vmovdqa32", "alignedstore", @@ -1692,6 +2323,46 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; } +def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 immAllZerosV))), + (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, undef)), + (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src0))), + (VMOVDQU32Zrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, + (bc_v8i64 (v16i32 immAllZerosV)))), + (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8i64 
(masked_load addr:$ptr, VK8WM:$mask, undef)), + (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src0))), + (VMOVDQU64Zrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>; + +def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src)), + (VMOVDQU32Zmrk addr:$ptr, VK16WM:$mask, VR512:$src)>; + +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src)), + (VMOVDQU64Zmrk addr:$ptr, VK8WM:$mask, VR512:$src)>; + +// SKX replacement +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), + (VMOVDQU32Z256mrk addr:$ptr, VK8WM:$mask, VR256:$src)>; + +// KNL replacement +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), + (VMOVDQU32Zmrk addr:$ptr, + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), + (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; + +def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; + + // Move Int Doubleword to Packed Double Int // def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src), @@ -1798,12 +2469,12 @@ def VMOVQI2PQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), // AVX-512 MOVSS, MOVSD //===----------------------------------------------------------------------===// -multiclass avx512_move_scalar { let hasSideEffects = 0 in { - def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), - !strconcat(asm, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128X:$dst, (vt (OpNode VR128X:$src1, (scalar_to_vector RC:$src2))))], IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG; @@ -1811,16 +2482,22 @@ multiclass avx512_move_scalar , EVEX_4V, VEX_LIG, EVEX_K; def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - 
!strconcat(asm, " \t{$src, $dst|$dst, $src}"), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>, EVEX, VEX_LIG; + let mayStore = 1 in { def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), - !strconcat(asm, " \t{$src, $dst|$dst, $src}"), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, EVEX, VEX_LIG; + def mrk: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, VK1WM:$mask, RC:$src), + !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), + [], IIC_SSE_MOV_S_MR>, + EVEX, VEX_LIG, EVEX_K; + } // mayStore } //hasSideEffects = 0 } @@ -1840,6 +2517,10 @@ def : Pat<(f64 (X86select VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X), VK1WM:$mask, (f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>; +def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), + (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)), + (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + // For the disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def VMOVSSZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst), @@ -1870,7 +2551,7 @@ let Predicates = [HasAVX512] in { // Move low f32 and clear high bits. 
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSSZrr (v4f32 (V_SET0)), + (VMOVSSZrr (v4f32 (V_SET0)), (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>; def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))), (SUBREG_TO_REG (i32 0), @@ -1999,7 +2680,7 @@ let AddedComplexity = 15 in def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src), "vmovq\t{$src, $dst|$dst, $src}", - [(set VR128X:$dst, (v2i64 (X86vzmovl + [(set VR128X:$dst, (v2i64 (X86vzmovl (v2i64 VR128X:$src))))], IIC_SSE_MOVQ_RR>, EVEX, VEX_W; @@ -2021,7 +2702,7 @@ let Predicates = [HasAVX512] in { (VMOV64toPQIZrr GR64:$src)>; def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), (VMOVDI2PDIZrr GR32:$src)>; - + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), (VMOVDI2PDIZrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), @@ -2058,136 +2739,201 @@ def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), //===----------------------------------------------------------------------===// // AVX-512 - Non-temporals //===----------------------------------------------------------------------===// +let SchedRW = [WriteLoad] in { + def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst), + (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", + [(set VR512:$dst, (int_x86_avx512_movntdqa addr:$src))], + SSEPackedInt>, EVEX, T8PD, EVEX_V512, + EVEX_CD8<64, CD8VF>; + + let Predicates = [HasAVX512, HasVLX] in { + def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst), + (ins i256mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", [], + SSEPackedInt>, EVEX, T8PD, EVEX_V256, + EVEX_CD8<64, CD8VF>; + + def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst), + (ins i128mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", [], + SSEPackedInt>, EVEX, T8PD, EVEX_V128, + EVEX_CD8<64, CD8VF>; + } +} + +multiclass avx512_movnt opc, string OpcodeStr, PatFrag st_frag, + 
ValueType OpVT, RegisterClass RC, X86MemOperand memop, + Domain d, InstrItinClass itin = IIC_SSE_MOVNT> { + let SchedRW = [WriteStore], mayStore = 1, + AddedComplexity = 400 in + def mr : AVX512PI, EVEX; +} -def VMOVNTDQAZrm : AVX5128I<0x2A, MRMSrcMem, (outs VR512:$dst), - (ins i512mem:$src), - "vmovntdqa\t{$src, $dst|$dst, $src}", - [(set VR512:$dst, - (int_x86_avx512_movntdqa addr:$src))]>, - EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>; - -// Prefer non-temporal over temporal versions -let AddedComplexity = 400, SchedRW = [WriteStore] in { - -def VMOVNTPSZmr : AVX512PSI<0x2B, MRMDestMem, (outs), - (ins f512mem:$dst, VR512:$src), - "vmovntps\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v16f32 VR512:$src), - addr:$dst)], - IIC_SSE_MOVNT>, - EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; - -def VMOVNTPDZmr : AVX512PDI<0x2B, MRMDestMem, (outs), - (ins f512mem:$dst, VR512:$src), - "vmovntpd\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v8f64 VR512:$src), - addr:$dst)], - IIC_SSE_MOVNT>, - EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - - -def VMOVNTDQZmr : AVX512BI<0xE7, MRMDestMem, (outs), - (ins i512mem:$dst, VR512:$src), - "vmovntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v8i64 VR512:$src), - addr:$dst)], - IIC_SSE_MOVNT>, - EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>; +multiclass avx512_movnt_vl opc, string OpcodeStr, PatFrag st_frag, + string elty, string elsz, string vsz512, + string vsz256, string vsz128, Domain d, + Predicate prd, InstrItinClass itin = IIC_SSE_MOVNT> { + let Predicates = [prd] in + defm Z : avx512_movnt("v"##vsz512##elty##elsz), VR512, + !cast(elty##"512mem"), d, itin>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_movnt("v"##vsz256##elty##elsz), VR256X, + !cast(elty##"256mem"), d, itin>, + EVEX_V256; + + defm Z128 : avx512_movnt("v"##vsz128##elty##elsz), VR128X, + !cast(elty##"128mem"), d, itin>, + EVEX_V128; + } } +defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", alignednontemporalstore, + "i", "64", 
"8", "4", "2", SSEPackedInt, + HasAVX512>, PD, EVEX_CD8<64, CD8VF>; + +defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", alignednontemporalstore, + "f", "64", "8", "4", "2", SSEPackedDouble, + HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", alignednontemporalstore, + "f", "32", "16", "8", "4", SSEPackedSingle, + HasAVX512>, PS, EVEX_CD8<32, CD8VF>; + //===----------------------------------------------------------------------===// // AVX-512 - Integer arithmetic // multiclass avx512_binop_rm opc, string OpcodeStr, SDNode OpNode, - ValueType OpVT, RegisterClass KRC, - RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop, PatFrag scalar_mfrag, - X86MemOperand x86scalar_mop, string BrdcstStr, - OpndItins itins, bit IsCommutable = 0> { - let isCommutable = IsCommutable in - def rr : AVX512BI, EVEX_4V; - let AddedComplexity = 30 in { - let Constraints = "$src0 = $dst" in - def rrk : AVX512BI, EVEX_4V, EVEX_K; - def rrkz : AVX512BI, EVEX_4V, EVEX_KZ; + X86VectorVTInfo _, OpndItins itins, + bit IsCommutable = 0> { + defm rr : AVX512_maskable, + AVX512BIBase, EVEX_4V; + + let mayLoad = 1 in + defm rm : AVX512_maskable, + AVX512BIBase, EVEX_4V; +} + +multiclass avx512_binop_rmb opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, OpndItins itins, + bit IsCommutable = 0> : + avx512_binop_rm { + let mayLoad = 1 in + defm rmb : AVX512_maskable, + AVX512BIBase, EVEX_4V, EVEX_B; +} + +multiclass avx512_binop_rm_vl opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, OpndItins itins, + Predicate prd, bit IsCommutable = 0> { + let Predicates = [prd] in + defm Z : avx512_binop_rm, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_binop_rm, EVEX_V256; + defm Z128 : avx512_binop_rm, EVEX_V128; } +} - let mayLoad = 1 in { - def rm : AVX512BI, EVEX_4V; - let AddedComplexity = 30 in { - let Constraints = "$src0 = $dst" in - def rmk : AVX512BI, EVEX_4V, EVEX_K; - def rmkz : AVX512BI, 
EVEX_4V, EVEX_KZ; - } - def rmb : AVX512BI, EVEX_4V, EVEX_B; - let AddedComplexity = 30 in { - let Constraints = "$src0 = $dst" in - def rmbk : AVX512BI, EVEX_4V, EVEX_B, EVEX_K; - def rmbkz : AVX512BI, EVEX_4V, EVEX_B, EVEX_KZ; - } +multiclass avx512_binop_rmb_vl opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, OpndItins itins, + Predicate prd, bit IsCommutable = 0> { + let Predicates = [prd] in + defm Z : avx512_binop_rmb, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_binop_rmb, EVEX_V256; + defm Z128 : avx512_binop_rmb, EVEX_V128; } } +multiclass avx512_binop_rm_vl_q opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rmb_vl, + VEX_W, EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_binop_rm_vl_d opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rmb_vl, EVEX_CD8<32, CD8VF>; +} + +multiclass avx512_binop_rm_vl_w opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl, EVEX_CD8<16, CD8VF>; +} + +multiclass avx512_binop_rm_vl_b opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl, EVEX_CD8<8, CD8VF>; +} + +multiclass avx512_binop_rm_vl_dq opc_d, bits<8> opc_q, string OpcodeStr, + SDNode OpNode, OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm Q : avx512_binop_rm_vl_q; + + defm D : avx512_binop_rm_vl_d; +} + +multiclass avx512_binop_rm_vl_bw opc_b, bits<8> opc_w, string OpcodeStr, + SDNode OpNode, OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm W : avx512_binop_rm_vl_w; + + defm B : avx512_binop_rm_vl_b; +} + +multiclass avx512_binop_rm_vl_all opc_b, bits<8> opc_w, + bits<8> opc_d, bits<8> opc_q, + string OpcodeStr, SDNode OpNode, + OpndItins itins, bit IsCommutable = 0> { + defm NAME : 
avx512_binop_rm_vl_dq, + avx512_binop_rm_vl_bw; +} + multiclass avx512_binop_rm2 opc, string OpcodeStr, ValueType DstVT, ValueType SrcVT, RegisterClass KRC, RegisterClass RC, PatFrag memop_frag, X86MemOperand x86memop, @@ -2197,73 +2943,64 @@ multiclass avx512_binop_rm2 opc, string OpcodeStr, ValueType DstVT, { def rr : AVX512BI, EVEX_4V; def rrk : AVX512BI, EVEX_4V, EVEX_K; def rrkz : AVX512BI, EVEX_4V, EVEX_KZ; } let mayLoad = 1 in { def rm : AVX512BI, EVEX_4V; def rmk : AVX512BI, EVEX_4V, EVEX_K; def rmkz : AVX512BI, EVEX_4V, EVEX_KZ; def rmb : AVX512BI, EVEX_4V, EVEX_B; def rmbk : AVX512BI, EVEX_4V, EVEX_B, EVEX_K; def rmbkz : AVX512BI, EVEX_4V, EVEX_B, EVEX_KZ; } } -defm VPADDDZ : avx512_binop_rm<0xFE, "vpaddd", add, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>; - -defm VPSUBDZ : avx512_binop_rm<0xFA, "vpsubd", sub, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>; - -defm VPMULLDZ : avx512_binop_rm<0x40, "vpmulld", mul, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; - -defm VPADDQZ : avx512_binop_rm<0xD4, "vpaddq", add, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 1>, EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_W; - -defm VPSUBQZ : avx512_binop_rm<0xFB, "vpsubq", sub, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 0>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add, + SSE_INTALU_ITINS_P, 1>; +defm VPSUB : avx512_binop_rm_vl_all<0xF8, 0xF9, 0xFA, 0xFB, "vpsub", sub, + SSE_INTALU_ITINS_P, 0>; +defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmull", mul, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; +defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmull", mul, + 
SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmull", mul, + SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; defm VPMULDQZ : avx512_binop_rm2<0x28, "vpmuldq", v8i64, v16i32, VK8WM, VR512, memopv8i64, i512mem, loadi64, i64mem, "{1to8}", @@ -2284,41 +3021,33 @@ def : Pat<(v8i64 (int_x86_avx512_mask_pmul_dq_512 (v16i32 VR512:$src1), (v16i32 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))), (VPMULDQZrr VR512:$src1, VR512:$src2)>; -defm VPMAXUDZ : avx512_binop_rm<0x3F, "vpmaxud", X86umax, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, - T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPMAXUQZ : avx512_binop_rm<0x3F, "vpmaxuq", X86umax, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 0>, - T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPMAXSDZ : avx512_binop_rm<0x3D, "vpmaxsd", X86smax, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, - T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPMAXSQZ : avx512_binop_rm<0x3D, "vpmaxsq", X86smax, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 0>, - T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPMINUDZ : avx512_binop_rm<0x3B, "vpminud", X86umin, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, - T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPMINUQZ : avx512_binop_rm<0x3B, "vpminuq", X86umin, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 0>, - T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPMINSDZ : avx512_binop_rm<0x39, "vpminsd", X86smin, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, - T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPMINSQZ : avx512_binop_rm<0x39, "vpminsq", X86smin, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, 
"{1to8}", - SSE_INTALU_ITINS_P, 0>, - T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxs", X86smax, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxs", X86smax, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", X86smax, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; + +defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxu", X86umax, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxu", X86umax, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", X86umax, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; + +defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpmins", X86smin, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpmins", X86smin, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", X86smin, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; + +defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminu", X86umin, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminu", X86umin, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", X86umin, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; def : Pat <(v16i32 (int_x86_avx512_mask_pmaxs_d_512 (v16i32 VR512:$src1), (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))), @@ -2383,12 +3112,12 @@ multiclass avx512_unpack_int opc, string OpcodeStr, SDNode OpNode, X86MemOperand x86memop> { def rr : AVX512BI, EVEX_4V; def rm : AVX512BI, EVEX_4V; @@ -2410,19 +3139,19 @@ defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64, // multiclass avx512_pshuf_imm opc, string OpcodeStr, RegisterClass RC, - SDNode OpNode, PatFrag mem_frag, + SDNode OpNode, PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT> { def ri : AVX512Ii8, EVEX; def mi : AVX512Ii8, EVEX; @@ -2431,48 +3160,18 @@ 
multiclass avx512_pshuf_imm opc, string OpcodeStr, RegisterClass RC, defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32, i512mem, v16i32>, PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -let ExeDomain = SSEPackedSingle in -defm VPERMILPSZ : avx512_pshuf_imm<0x04, "vpermilps", VR512, X86VPermilp, - memopv16f32, i512mem, v16f32>, TAPD, EVEX_V512, - EVEX_CD8<32, CD8VF>; -let ExeDomain = SSEPackedDouble in -defm VPERMILPDZ : avx512_pshuf_imm<0x05, "vpermilpd", VR512, X86VPermilp, - memopv8f64, i512mem, v8f64>, TAPD, EVEX_V512, - VEX_W, EVEX_CD8<32, CD8VF>; - -def : Pat<(v16i32 (X86VPermilp VR512:$src1, (i8 imm:$imm))), - (VPERMILPSZri VR512:$src1, imm:$imm)>; -def : Pat<(v8i64 (X86VPermilp VR512:$src1, (i8 imm:$imm))), - (VPERMILPDZri VR512:$src1, imm:$imm)>; - //===----------------------------------------------------------------------===// // AVX-512 Logical Instructions //===----------------------------------------------------------------------===// -defm VPANDDZ : avx512_binop_rm<0xDB, "vpandd", and, v16i32, VK16WM, VR512, memopv16i32, - i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPANDQZ : avx512_binop_rm<0xDB, "vpandq", and, v8i64, VK8WM, VR512, memopv8i64, - i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPORDZ : avx512_binop_rm<0xEB, "vpord", or, v16i32, VK16WM, VR512, memopv16i32, - i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPORQZ : avx512_binop_rm<0xEB, "vporq", or, v8i64, VK8WM, VR512, memopv8i64, - i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPXORDZ : avx512_binop_rm<0xEF, "vpxord", xor, v16i32, VK16WM, VR512, memopv16i32, - i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPXORQZ : avx512_binop_rm<0xEF, "vpxorq", xor, v8i64, VK8WM, VR512, memopv8i64, - i512mem, 
loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPANDNDZ : avx512_binop_rm<0xDF, "vpandnd", X86andnp, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_BIT_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPANDNQZ : avx512_binop_rm<0xDF, "vpandnq", X86andnp, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_BIT_ITINS_P, 0>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPAND : avx512_binop_rm_vl_dq<0xDB, 0xDB, "vpand", and, + SSE_INTALU_ITINS_P, HasAVX512, 1>; +defm VPOR : avx512_binop_rm_vl_dq<0xEB, 0xEB, "vpor", or, + SSE_INTALU_ITINS_P, HasAVX512, 1>; +defm VPXOR : avx512_binop_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, + SSE_INTALU_ITINS_P, HasAVX512, 1>; +defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, + SSE_INTALU_ITINS_P, HasAVX512, 1>; //===----------------------------------------------------------------------===// // AVX-512 FP arithmetic @@ -2500,118 +3199,58 @@ defm VDIV : avx512_binop_s<0x5E, "div", fdiv, SSE_ALU_ITINS_S>; } multiclass avx512_fp_packed opc, string OpcodeStr, SDNode OpNode, - RegisterClass KRC, - RegisterClass RC, ValueType vt, - X86MemOperand x86memop, PatFrag mem_frag, - X86MemOperand x86scalar_mop, PatFrag scalar_mfrag, - string BrdcstStr, - Domain d, OpndItins itins, bit commutable> { - let isCommutable = commutable in { - def rr : PI, - EVEX_4V; - - def rrk: PI, EVEX_4V, EVEX_K; - - def rrkz: PI, EVEX_4V, EVEX_KZ; - } - + X86VectorVTInfo _, bit IsCommutable> { + defm rr: AVX512_maskable, EVEX_4V; let mayLoad = 1 in { - def rm : PI, EVEX_4V; - - def rmb : PI, EVEX_4V, EVEX_B; - - def rmk : PI, EVEX_4V, EVEX_K; - - def rmkz : PI, EVEX_4V, EVEX_KZ; - - def rmbk : PI, EVEX_4V, EVEX_B, EVEX_K; - - def rmbkz : PI, EVEX_4V, EVEX_B, EVEX_KZ; + defm rm: AVX512_maskable, EVEX_4V; + defm rmb: AVX512_maskable, + EVEX_4V, EVEX_B; + }//let mayLoad = 1 +} + +multiclass avx512_fp_binop_p opc, string OpcodeStr, SDNode OpNode, + bit 
IsCommutable = 0> { + defm PSZ : avx512_fp_packed, EVEX_V512, PS, + EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_packed, EVEX_V512, PD, VEX_W, + EVEX_CD8<64, CD8VF>; + + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp_packed, EVEX_V128, PS, + EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp_packed, EVEX_V256, PS, + EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp_packed, EVEX_V128, PD, VEX_W, + EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp_packed, EVEX_V256, PD, VEX_W, + EVEX_CD8<64, CD8VF>; } } -defm VADDPSZ : avx512_fp_packed<0x58, "addps", fadd, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 1>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; - -defm VADDPDZ : avx512_fp_packed<0x58, "addpd", fadd, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 1>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VMULPSZ : avx512_fp_packed<0x59, "mulps", fmul, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 1>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; -defm VMULPDZ : avx512_fp_packed<0x59, "mulpd", fmul, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 1>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VMINPSZ : avx512_fp_packed<0x5D, "minps", X86fmin, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 1>, - EVEX_V512, PS, EVEX_CD8<32, CD8VF>; -defm VMAXPSZ : avx512_fp_packed<0x5F, "maxps", X86fmax, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 1>, - EVEX_V512, PS, EVEX_CD8<32, CD8VF>; - -defm VMINPDZ : avx512_fp_packed<0x5D, "minpd", X86fmin, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 1>, - 
EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; -defm VMAXPDZ : avx512_fp_packed<0x5F, "maxpd", X86fmax, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 1>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VSUBPSZ : avx512_fp_packed<0x5C, "subps", fsub, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 0>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; -defm VDIVPSZ : avx512_fp_packed<0x5E, "divps", fdiv, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 0>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; - -defm VSUBPDZ : avx512_fp_packed<0x5C, "subpd", fsub, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 0>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; -defm VDIVPDZ : avx512_fp_packed<0x5E, "divpd", fdiv, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 0>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; +defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>; +defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>; +defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, 1>; +defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, 1>; +defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>; +defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>; def : Pat<(v16f32 (int_x86_avx512_mask_max_ps_512 (v16f32 VR512:$src1), (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)), @@ -2636,18 +3275,18 @@ def : Pat<(v8f64 (int_x86_avx512_mask_min_pd_512 (v8f64 VR512:$src1), // AVX-512 VPTESTM instructions //===----------------------------------------------------------------------===// -multiclass avx512_vptest opc, string OpcodeStr, RegisterClass KRC, - RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, +multiclass avx512_vptest opc, string OpcodeStr, RegisterClass KRC, + RegisterClass RC, X86MemOperand 
x86memop, PatFrag memop_frag, SDNode OpNode, ValueType vt> { def rr : AVX512PI, EVEX_4V; def rm : AVX512PI, EVEX_4V; } @@ -2674,154 +3313,122 @@ def : Pat <(i16 (int_x86_avx512_mask_ptestm_d_512 (v16i32 VR512:$src1), def : Pat <(i8 (int_x86_avx512_mask_ptestm_q_512 (v8i64 VR512:$src1), (v8i64 VR512:$src2), (i8 -1))), (COPY_TO_REGCLASS (VPTESTMQZrr VR512:$src1, VR512:$src2), GR8)>; + //===----------------------------------------------------------------------===// // AVX-512 Shift instructions //===----------------------------------------------------------------------===// multiclass avx512_shift_rmi opc, Format ImmFormR, Format ImmFormM, - string OpcodeStr, SDNode OpNode, RegisterClass RC, - ValueType vt, X86MemOperand x86memop, PatFrag mem_frag, - RegisterClass KRC> { - def ri : AVX512BIi8, EVEX_4V; - def rik : AVX512BIi8, EVEX_4V, EVEX_K; - def mi: AVX512BIi8, EVEX_4V; - def mik: AVX512BIi8, EVEX_4V, EVEX_K; + string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + defm ri : AVX512_maskable, AVX512BIi8Base, EVEX_4V; + defm mi : AVX512_maskable, AVX512BIi8Base, EVEX_4V; } multiclass avx512_shift_rrm opc, string OpcodeStr, SDNode OpNode, - RegisterClass RC, ValueType vt, ValueType SrcVT, - PatFrag bc_frag, RegisterClass KRC> { - // src2 is always 128-bit - def rr : AVX512BI, EVEX_4V; - def rrk : AVX512BI, EVEX_4V, EVEX_K; - def rm : AVX512BI, EVEX_4V; - def rmk : AVX512BI, EVEX_4V, EVEX_K; + ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { + // src2 is always 128-bit + defm rr : AVX512_maskable, AVX512BIBase, EVEX_4V; + defm rm : AVX512_maskable, AVX512BIBase, EVEX_4V; +} + +multiclass avx512_shift_sizes opc, string OpcodeStr, SDNode OpNode, + ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { + defm Z : avx512_shift_rrm, EVEX_V512; +} + +multiclass avx512_shift_types opcd, bits<8> opcq, string OpcodeStr, + SDNode OpNode> { + defm D : avx512_shift_sizes, EVEX_CD8<32, CD8VQ>; + defm Q : avx512_shift_sizes, EVEX_CD8<64, CD8VQ>, VEX_W; } defm VPSRLDZ : 
avx512_shift_rmi<0x72, MRM2r, MRM2m, "vpsrld", X86vsrli, - VR512, v16i32, i512mem, memopv16i32, VK16WM>, + v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPSRLDZ : avx512_shift_rrm<0xD2, "vpsrld", X86vsrl, - VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512, - EVEX_CD8<32, CD8VQ>; - defm VPSRLQZ : avx512_shift_rmi<0x73, MRM2r, MRM2m, "vpsrlq", X86vsrli, - VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512, + v8i64_info>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; -defm VPSRLQZ : avx512_shift_rrm<0xD3, "vpsrlq", X86vsrl, - VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512, - EVEX_CD8<64, CD8VQ>, VEX_W; defm VPSLLDZ : avx512_shift_rmi<0x72, MRM6r, MRM6m, "vpslld", X86vshli, - VR512, v16i32, i512mem, memopv16i32, VK16WM>, EVEX_V512, + v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPSLLDZ : avx512_shift_rrm<0xF2, "vpslld", X86vshl, - VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512, - EVEX_CD8<32, CD8VQ>; - defm VPSLLQZ : avx512_shift_rmi<0x73, MRM6r, MRM6m, "vpsllq", X86vshli, - VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512, + v8i64_info>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; -defm VPSLLQZ : avx512_shift_rrm<0xF3, "vpsllq", X86vshl, - VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512, - EVEX_CD8<64, CD8VQ>, VEX_W; defm VPSRADZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsrad", X86vsrai, - VR512, v16i32, i512mem, memopv16i32, VK16WM>, + v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPSRADZ : avx512_shift_rrm<0xE2, "vpsrad", X86vsra, - VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512, - EVEX_CD8<32, CD8VQ>; - defm VPSRAQZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsraq", X86vsrai, - VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512, + v8i64_info>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; -defm VPSRAQZ : avx512_shift_rrm<0xE2, "vpsraq", X86vsra, - VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512, - EVEX_CD8<64, CD8VQ>, VEX_W; + +defm VPSLL : avx512_shift_types<0xF2, 0xF3, "vpsll", X86vshl>; +defm VPSRA : avx512_shift_types<0xE2, 0xE2, "vpsra", 
X86vsra>; +defm VPSRL : avx512_shift_types<0xD2, 0xD3, "vpsrl", X86vsrl>; //===-------------------------------------------------------------------===// // Variable Bit Shifts //===-------------------------------------------------------------------===// multiclass avx512_var_shift opc, string OpcodeStr, SDNode OpNode, - RegisterClass RC, ValueType vt, - X86MemOperand x86memop, PatFrag mem_frag> { - def rr : AVX5128I, - EVEX_4V; - def rm : AVX5128I, - EVEX_4V; + X86VectorVTInfo _> { + defm rr : AVX512_maskable, AVX5128IBase, EVEX_4V; + defm rm : AVX512_maskable, AVX5128IBase, EVEX_4V; } -defm VPSLLVDZ : avx512_var_shift<0x47, "vpsllvd", shl, VR512, v16i32, - i512mem, memopv16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPSLLVQZ : avx512_var_shift<0x47, "vpsllvq", shl, VR512, v8i64, - i512mem, memopv8i64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; -defm VPSRLVDZ : avx512_var_shift<0x45, "vpsrlvd", srl, VR512, v16i32, - i512mem, memopv16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPSRLVQZ : avx512_var_shift<0x45, "vpsrlvq", srl, VR512, v8i64, - i512mem, memopv8i64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; -defm VPSRAVDZ : avx512_var_shift<0x46, "vpsravd", sra, VR512, v16i32, - i512mem, memopv16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPSRAVQZ : avx512_var_shift<0x46, "vpsravq", sra, VR512, v8i64, - i512mem, memopv8i64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; +multiclass avx512_var_shift_sizes opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo _> { + defm Z : avx512_var_shift, EVEX_V512; +} + +multiclass avx512_var_shift_types opc, string OpcodeStr, + SDNode OpNode> { + defm D : avx512_var_shift_sizes, EVEX_CD8<32, CD8VQ>; + defm Q : avx512_var_shift_sizes, EVEX_CD8<64, CD8VQ>, VEX_W; +} + +defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>; +defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>; +defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>; //===----------------------------------------------------------------------===// // 
AVX-512 - MOVDDUP //===----------------------------------------------------------------------===// -multiclass avx512_movddup { def rr : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX; def rm : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX; } @@ -2838,11 +3445,11 @@ multiclass avx512_replicate_sfp op, SDNode OpNode, string OpcodeStr, ValueType vt, RegisterClass RC, PatFrag mem_frag, X86MemOperand x86memop> { def rr : AVX512XSI, EVEX; let mayLoad = 1 in def rm : AVX512XSI, EVEX; } @@ -2889,175 +3496,139 @@ let Predicates = [HasAVX512] in { //===----------------------------------------------------------------------===// // FMA - Fused Multiply Operations // + let Constraints = "$src1 = $dst" in { -multiclass avx512_fma3p_rm opc, string OpcodeStr, - RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag, - string BrdcstStr, SDNode OpNode, ValueType OpVT> { - def r: AVX512FMA3; +// Omitting the parameter OpNode (= null_frag) disables ISel pattern matching. 
+multiclass avx512_fma3p_rm opc, string OpcodeStr, X86VectorVTInfo _, + SDPatternOperator OpNode = null_frag> { + defm r: AVX512_maskable_3src, + AVX512FMA3Base; let mayLoad = 1 in - def m: AVX512FMA3; - def mb: AVX512FMA3, EVEX_B; -} + defm m: AVX512_maskable_3src, + AVX512FMA3Base; + + defm mb: AVX512_maskable_3src, + AVX512FMA3Base, EVEX_B; + } } // Constraints = "$src1 = $dst" -let ExeDomain = SSEPackedSingle in { - defm VFMADD213PSZ : avx512_fma3p_rm<0xA8, "vfmadd213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmadd, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFMSUB213PSZ : avx512_fma3p_rm<0xAA, "vfmsub213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmsub, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFMADDSUB213PSZ : avx512_fma3p_rm<0xA6, "vfmaddsub213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmaddsub, v16f32>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMSUBADD213PSZ : avx512_fma3p_rm<0xA7, "vfmsubadd213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmsubadd, v16f32>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFNMADD213PSZ : avx512_fma3p_rm<0xAC, "vfnmadd213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fnmadd, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFNMSUB213PSZ : avx512_fma3p_rm<0xAE, "vfnmsub213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fnmsub, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; +multiclass avx512_fma3p_forms opc213, bits<8> opc231, + string OpcodeStr, X86VectorVTInfo VTI, + SDPatternOperator OpNode> { + defm v213r : avx512_fma3p_rm, EVEX_CD8; + + defm v231r : avx512_fma3p_rm, EVEX_CD8; } + +multiclass avx512_fma3p opc213, bits<8> opc231, + string OpcodeStr, + SDPatternOperator OpNode> { +let ExeDomain = SSEPackedSingle in { + defm NAME##PSZ : avx512_fma3p_forms, EVEX_V512; + defm NAME##PSZ256 : avx512_fma3p_forms, EVEX_V256; + defm NAME##PSZ128 : avx512_fma3p_forms, EVEX_V128; + } 
let ExeDomain = SSEPackedDouble in { - defm VFMADD213PDZ : avx512_fma3p_rm<0xA8, "vfmadd213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmadd, v8f64>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMSUB213PDZ : avx512_fma3p_rm<0xAA, "vfmsub213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFMADDSUB213PDZ : avx512_fma3p_rm<0xA6, "vfmaddsub213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmaddsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFMSUBADD213PDZ : avx512_fma3p_rm<0xA7, "vfmsubadd213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmsubadd, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFNMADD213PDZ : avx512_fma3p_rm<0xAC, "vfnmadd213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fnmadd, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFNMSUB213PDZ : avx512_fma3p_rm<0xAE, "vfnmsub213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fnmsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; + defm NAME##PDZ : avx512_fma3p_forms, EVEX_V512, VEX_W; + defm NAME##PDZ256 : avx512_fma3p_forms, EVEX_V256, VEX_W; + defm NAME##PDZ128 : avx512_fma3p_forms, EVEX_V128, VEX_W; + } } +defm VFMADD : avx512_fma3p<0xA8, 0xB8, "vfmadd", X86Fmadd>; +defm VFMSUB : avx512_fma3p<0xAA, 0xBA, "vfmsub", X86Fmsub>; +defm VFMADDSUB : avx512_fma3p<0xA6, 0xB6, "vfmaddsub", X86Fmaddsub>; +defm VFMSUBADD : avx512_fma3p<0xA7, 0xB7, "vfmsubadd", X86Fmsubadd>; +defm VFNMADD : avx512_fma3p<0xAC, 0xBC, "vfnmadd", X86Fnmadd>; +defm VFNMSUB : avx512_fma3p<0xAE, 0xBE, "vfnmsub", X86Fnmsub>; + let Constraints = "$src1 = $dst" in { -multiclass avx512_fma3p_m132 opc, string OpcodeStr, - RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag, - string BrdcstStr, SDNode OpNode, ValueType OpVT> { +multiclass avx512_fma3p_m132 opc, string 
OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { let mayLoad = 1 in - def m: AVX512FMA3; - def mb: AVX512FMA3, EVEX_B; + def m: AVX512FMA3; + def mb: AVX512FMA3, EVEX_B; } } // Constraints = "$src1 = $dst" +multiclass avx512_fma3p_m132_f opc, + string OpcodeStr, + SDNode OpNode> { + let ExeDomain = SSEPackedSingle in { - defm VFMADD132PSZ : avx512_fma3p_m132<0x98, "vfmadd132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmadd, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFMSUB132PSZ : avx512_fma3p_m132<0x9A, "vfmsub132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmsub, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFMADDSUB132PSZ : avx512_fma3p_m132<0x96, "vfmaddsub132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmaddsub, v16f32>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMSUBADD132PSZ : avx512_fma3p_m132<0x97, "vfmsubadd132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmsubadd, v16f32>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFNMADD132PSZ : avx512_fma3p_m132<0x9C, "vfnmadd132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fnmadd, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFNMSUB132PSZ : avx512_fma3p_m132<0x9E, "vfnmsub132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fnmsub, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -} + defm NAME##PSZ : avx512_fma3p_m132, EVEX_V512, EVEX_CD8<32, CD8VF>; + defm NAME##PSZ256 : avx512_fma3p_m132, EVEX_V256, EVEX_CD8<32, CD8VF>; + defm NAME##PSZ128 : avx512_fma3p_m132, EVEX_V128, EVEX_CD8<32, CD8VF>; + } let ExeDomain = SSEPackedDouble in { - defm VFMADD132PDZ : avx512_fma3p_m132<0x98, "vfmadd132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmadd, v8f64>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMSUB132PDZ : avx512_fma3p_m132<0x9A, "vfmsub132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmsub, v8f64>, EVEX_V512, VEX_W, - 
EVEX_CD8<64, CD8VF>; - defm VFMADDSUB132PDZ : avx512_fma3p_m132<0x96, "vfmaddsub132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmaddsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFMSUBADD132PDZ : avx512_fma3p_m132<0x97, "vfmsubadd132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmsubadd, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFNMADD132PDZ : avx512_fma3p_m132<0x9C, "vfnmadd132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fnmadd, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFNMSUB132PDZ : avx512_fma3p_m132<0x9E, "vfnmsub132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fnmsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; + defm NAME##PDZ : avx512_fma3p_m132, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm NAME##PDZ256 : avx512_fma3p_m132, EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; + defm NAME##PDZ128 : avx512_fma3p_m132, EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; + } } +defm VFMADD132 : avx512_fma3p_m132_f<0x98, "vfmadd132", X86Fmadd>; +defm VFMSUB132 : avx512_fma3p_m132_f<0x9A, "vfmsub132", X86Fmsub>; +defm VFMADDSUB132 : avx512_fma3p_m132_f<0x96, "vfmaddsub132", X86Fmaddsub>; +defm VFMSUBADD132 : avx512_fma3p_m132_f<0x97, "vfmsubadd132", X86Fmsubadd>; +defm VFNMADD132 : avx512_fma3p_m132_f<0x9C, "vfnmadd132", X86Fnmadd>; +defm VFNMSUB132 : avx512_fma3p_m132_f<0x9E, "vfnmsub132", X86Fnmsub>; + + // Scalar FMA let Constraints = "$src1 = $dst" in { -multiclass avx512_fma3s_rm opc, string OpcodeStr, SDNode OpNode, - RegisterClass RC, ValueType OpVT, - X86MemOperand x86memop, Operand memop, +multiclass avx512_fma3s_rm opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, ValueType OpVT, + X86MemOperand x86memop, Operand memop, PatFrag mem_frag> { let isCommutable = 1 in def r : AVX512FMA3; let mayLoad = 1 in def m : AVX512FMA3; @@ -3090,12 +3661,12 @@ multiclass avx512_vcvtsi opc, RegisterClass SrcRC, RegisterClass DstRC, X86MemOperand
x86memop, string asm> { let hasSideEffects = 0 in { def rr : SI, + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, EVEX_4V; let mayLoad = 1 in def rm : SI, + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, EVEX_4V; } // hasSideEffects = 0 } @@ -3163,12 +3734,12 @@ multiclass avx512_cvt_s_int opc, RegisterClass SrcRC, RegisterClass DstR string asm> { let hasSideEffects = 0 in { def rr : SI, EVEX, VEX_LIG, Requires<[HasAVX512]>; let mayLoad = 1 in def rm : SI, EVEX, VEX_LIG, + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG, Requires<[HasAVX512]>; } // hasSideEffects = 0 } @@ -3266,10 +3837,10 @@ multiclass avx512_cvt_s opc, RegisterClass SrcRC, RegisterClass DstRC, SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, string asm> { def rr : SI, EVEX; def rm : SI, EVEX; } @@ -3342,21 +3913,21 @@ def : Pat<(extloadf32 addr:$src), def : Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>, Requires<[HasAVX512]>; -multiclass avx512_vcvt_fp_with_rc opc, string asm, RegisterClass SrcRC, - RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag, +multiclass avx512_vcvt_fp_with_rc opc, string asm, RegisterClass SrcRC, + RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT, ValueType InVT, Domain d> { let hasSideEffects = 0 in { def rr : AVX512PI, EVEX; def rrb : AVX512PI, EVEX, EVEX_B, EVEX_RC; let mayLoad = 1 in def rm : AVX512PI, EVEX; } // hasSideEffects = 0 @@ -3368,12 +3939,12 @@ multiclass avx512_vcvt_fp opc, string asm, RegisterClass SrcRC, Domain d> { let hasSideEffects = 0 in { def rr : AVX512PI, EVEX; let mayLoad = 1 in def rm : AVX512PI, EVEX; } // hasSideEffects = 0 @@ -3390,7 +3961,7 @@ defm VCVTPS2PDZ : avx512_vcvt_fp<0x5A, "vcvtps2pd", VR256X, VR512, fextend, EVEX_CD8<32, CD8VH>; def : Pat<(v8f64 (extloadv8f32 addr:$src)), (VCVTPS2PDZrm addr:$src)>; - + def : Pat<(v8f32 (int_x86_avx512_mask_cvtpd2ps_512 (v8f64 VR512:$src), (bc_v8f32(v8i32 immAllZerosV)), 
(i8 -1), (i32 FROUND_CURRENT))), (VCVTPD2PSZrr VR512:$src)>; @@ -3419,7 +3990,7 @@ defm VCVTTPS2DQZ : avx512_vcvt_fp<0x5B, "vcvttps2dq", VR512, VR512, fp_to_sint, EVEX_CD8<32, CD8VF>; defm VCVTTPD2DQZ : avx512_vcvt_fp<0xE6, "vcvttpd2dq", VR512, VR256X, fp_to_sint, - memopv8f64, f512mem, v8i32, v8f64, + memopv8f64, f512mem, v8i32, v8f64, SSEPackedDouble>, EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; @@ -3437,7 +4008,7 @@ defm VCVTTPD2UDQZ : avx512_vcvt_fp<0x78, "vcvttpd2udq", VR512, VR256X, fp_to_uin memopv8f64, f512mem, v8i32, v8f64, SSEPackedDouble>, EVEX_V512, PS, VEX_W, EVEX_CD8<64, CD8VF>; - + // cvttpd2udq (src, 0, mask-all-ones, sae-current) def : Pat<(v8i32 (int_x86_avx512_mask_cvttpd2udq_512 (v8f64 VR512:$src), (v8i32 immAllZerosV), (i8 -1), FROUND_CURRENT)), @@ -3447,16 +4018,16 @@ defm VCVTUDQ2PDZ : avx512_vcvt_fp<0x7A, "vcvtudq2pd", VR256X, VR512, uint_to_fp, memopv4i64, f256mem, v8f64, v8i32, SSEPackedDouble>, EVEX_V512, XS, EVEX_CD8<32, CD8VH>; - + defm VCVTUDQ2PSZ : avx512_vcvt_fp_with_rc<0x7A, "vcvtudq2ps", VR512, VR512, uint_to_fp, memopv16i32, f512mem, v16f32, v16i32, SSEPackedSingle>, EVEX_V512, XD, EVEX_CD8<32, CD8VF>; def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), - (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr + (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; - + def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))), (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; @@ -3464,7 +4035,7 @@ def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))), def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))), (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; - + def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))), (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr (v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; @@ -3491,14 +4062,14 @@ multiclass avx512_vcvt_fp2int opc, string 
asm, RegisterClass SrcRC, X86MemOperand x86memop, Domain d> { let hasSideEffects = 0 in { def rr : AVX512PI, EVEX; def rrb : AVX512PI, EVEX, EVEX_B, EVEX_RC; let mayLoad = 1 in def rm : AVX512PI, EVEX; } // hasSideEffects = 0 } @@ -3557,12 +4128,12 @@ multiclass avx512_cvtps2ph { def rr : AVX512AIi8<0x1D, MRMDestReg, (outs destRC:$dst), (ins srcRC:$src1, i32i8imm:$src2), - "vcvtps2ph \t{$src2, $src1, $dst|$dst, $src1, $src2}", + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX; let hasSideEffects = 0, mayStore = 1 in def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src1, i32i8imm:$src2), - "vcvtps2ph \t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX; + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX; } defm VCVTPH2PSZ : avx512_cvtph2ps, EVEX_V512, @@ -3609,7 +4180,7 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in { VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; } } - + /// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd multiclass avx512_fp14_s opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop> { @@ -3617,12 +4188,12 @@ multiclass avx512_fp14_s opc, string OpcodeStr, RegisterClass RC, def rr : AVX5128I, EVEX_4V; + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; let mayLoad = 1 in { def rm : AVX5128I, EVEX_4V; + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; } } } @@ -3658,26 +4229,49 @@ def : Pat <(v2f64 (int_x86_avx512_rsqrt14_sd (v2f64 VR128X:$src1), /// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd multiclass avx512_fp14_p opc, string OpcodeStr, SDNode OpNode, - RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_frag, ValueType OpVt> { - def r : AVX5128I, - EVEX; - def m : AVX5128I, - EVEX; -} -defm VRSQRT14PSZ : avx512_fp14_p<0x4E, "vrsqrt14ps", X86frsqrt, VR512, f512mem, - memopv16f32, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VRSQRT14PDZ : avx512_fp14_p<0x4E, "vrsqrt14pd", X86frsqrt, VR512, f512mem, - memopv8f64, v8f64>, VEX_W, EVEX_V512, 
EVEX_CD8<64, CD8VF>; -defm VRCP14PSZ : avx512_fp14_p<0x4C, "vrcp14ps", X86frcp, VR512, f512mem, - memopv16f32, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VRCP14PDZ : avx512_fp14_p<0x4C, "vrcp14pd", X86frcp, VR512, f512mem, - memopv8f64, v8f64>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; + X86VectorVTInfo _> { + defm r: AVX512_maskable, EVEX, T8PD; + let mayLoad = 1 in { + defm m: AVX512_maskable, EVEX, T8PD; + defm mb: AVX512_maskable, + EVEX, T8PD, EVEX_B; + } +} + +multiclass avx512_fp14_p_vl_all opc, string OpcodeStr, SDNode OpNode> { + defm PSZ : avx512_fp14_p, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp14_p, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp14_p, + EVEX_V128, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp14_p, + EVEX_V256, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp14_p, + EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp14_p, + EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; + } +} + +defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>; +defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>; def : Pat <(v16f32 (int_x86_avx512_rsqrt14_ps_512 (v16f32 VR512:$src), (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), @@ -3694,126 +4288,100 @@ def : Pat <(v8f64 (int_x86_avx512_rcp14_pd_512 (v8f64 VR512:$src), (VRCP14PDZr VR512:$src)>; /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd -multiclass avx512_fp28_s opc, string OpcodeStr, RegisterClass RC, - X86MemOperand x86memop> { - let hasSideEffects = 0, Predicates = [HasERI] in { - def rr : AVX5128I, EVEX_4V; - def rrb : AVX5128I, EVEX_4V, EVEX_B; - let mayLoad = 1 in { - def rm : AVX5128I, EVEX_4V; - } +multiclass avx512_fp28_s opc, string OpcodeStr,X86VectorVTInfo _, + SDNode OpNode> { + + defm r : AVX512_maskable_scalar; + + defm rb : AVX512_maskable_scalar, EVEX_B; + + defm m : AVX512_maskable_scalar; } + +multiclass avx512_eri_s opc, string OpcodeStr, 
SDNode OpNode> { + defm SS : avx512_fp28_s, + EVEX_CD8<32, CD8VT1>; + defm SD : avx512_fp28_s, + EVEX_CD8<64, CD8VT1>, VEX_W; } -defm VRCP28SS : avx512_fp28_s<0xCB, "vrcp28ss", FR32X, f32mem>, - EVEX_CD8<32, CD8VT1>; -defm VRCP28SD : avx512_fp28_s<0xCB, "vrcp28sd", FR64X, f64mem>, - VEX_W, EVEX_CD8<64, CD8VT1>; -defm VRSQRT28SS : avx512_fp28_s<0xCD, "vrsqrt28ss", FR32X, f32mem>, - EVEX_CD8<32, CD8VT1>; -defm VRSQRT28SD : avx512_fp28_s<0xCD, "vrsqrt28sd", FR64X, f64mem>, - VEX_W, EVEX_CD8<64, CD8VT1>; +let hasSideEffects = 0, Predicates = [HasERI] in { + defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s>, T8PD, EVEX_4V; + defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V; +} +/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd -def : Pat <(v4f32 (int_x86_avx512_rcp28_ss (v4f32 VR128X:$src1), - (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1), - FROUND_NO_EXC)), - (COPY_TO_REGCLASS (VRCP28SSrrb (COPY_TO_REGCLASS VR128X:$src1, FR32X), - (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>; +multiclass avx512_fp28_p opc, string OpcodeStr, X86VectorVTInfo _, + SDNode OpNode> { -def : Pat <(v2f64 (int_x86_avx512_rcp28_sd (v2f64 VR128X:$src1), - (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1), - FROUND_NO_EXC)), - (COPY_TO_REGCLASS (VRCP28SDrrb (COPY_TO_REGCLASS VR128X:$src1, FR64X), - (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>; + defm r : AVX512_maskable; -def : Pat <(v4f32 (int_x86_avx512_rsqrt28_ss (v4f32 VR128X:$src1), - (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1), - FROUND_NO_EXC)), - (COPY_TO_REGCLASS (VRSQRT28SSrrb (COPY_TO_REGCLASS VR128X:$src1, FR32X), - (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>; + defm rb : AVX512_maskable, EVEX_B; -def : Pat <(v2f64 (int_x86_avx512_rsqrt28_sd (v2f64 VR128X:$src1), - (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1), - FROUND_NO_EXC)), - (COPY_TO_REGCLASS (VRSQRT28SDrrb (COPY_TO_REGCLASS VR128X:$src1, FR64X), - 
(COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>; + defm m : AVX512_maskable; -/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd -multiclass avx512_fp28_p opc, string OpcodeStr, - RegisterClass RC, X86MemOperand x86memop> { - let hasSideEffects = 0, Predicates = [HasERI] in { - def r : AVX5128I, EVEX; - def rb : AVX5128I, EVEX, EVEX_B; - def m : AVX5128I, EVEX; - } + defm mb : AVX512_maskable, EVEX_B; } -defm VRSQRT28PSZ : avx512_fp28_p<0xCC, "vrsqrt28ps", VR512, f512mem>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VRSQRT28PDZ : avx512_fp28_p<0xCC, "vrsqrt28pd", VR512, f512mem>, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; -defm VRCP28PSZ : avx512_fp28_p<0xCA, "vrcp28ps", VR512, f512mem>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VRCP28PDZ : avx512_fp28_p<0xCA, "vrcp28pd", VR512, f512mem>, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; - -def : Pat <(v16f32 (int_x86_avx512_rsqrt28_ps (v16f32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_NO_EXC)), - (VRSQRT28PSZrb VR512:$src)>; -def : Pat <(v8f64 (int_x86_avx512_rsqrt28_pd (v8f64 VR512:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_NO_EXC)), - (VRSQRT28PDZrb VR512:$src)>; - -def : Pat <(v16f32 (int_x86_avx512_rcp28_ps (v16f32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_NO_EXC)), - (VRCP28PSZrb VR512:$src)>; -def : Pat <(v8f64 (int_x86_avx512_rcp28_pd (v8f64 VR512:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_NO_EXC)), - (VRCP28PDZrb VR512:$src)>; - -multiclass avx512_sqrt_packed opc, string OpcodeStr, SDNode OpNode, - OpndItins itins_s, OpndItins itins_d> { - def PSZrr :AVX512PSI, - EVEX, EVEX_V512; - let mayLoad = 1 in - def PSZrm : AVX512PSI, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; +multiclass avx512_eri opc, string OpcodeStr, SDNode OpNode> { + defm PS : avx512_fp28_p, + EVEX_CD8<32, CD8VF>; + defm PD : avx512_fp28_p, + VEX_W, EVEX_CD8<64, CD8VF>; +} - def PDZrr : AVX512PDI, - EVEX, EVEX_V512; +let Predicates = [HasERI], hasSideEffects = 0 in { - let
mayLoad = 1 in - def PDZrm : AVX512PDI, EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>; + defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX, EVEX_V512, T8PD; + defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28>, EVEX, EVEX_V512, T8PD; + defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2>, EVEX, EVEX_V512, T8PD; +} +multiclass avx512_sqrt_packed opc, string OpcodeStr, + SDNode OpNode, X86VectorVTInfo _>{ + defm r: AVX512_maskable, EVEX; + let mayLoad = 1 in { + defm m: AVX512_maskable, EVEX; + + defm mb: AVX512_maskable, + EVEX, EVEX_B; + } } multiclass avx512_sqrt_scalar opc, string OpcodeStr, @@ -3829,7 +4397,7 @@ multiclass avx512_sqrt_scalar opc, string OpcodeStr, (ins VR128X:$src1, VR128X:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, + [(set VR128X:$dst, (F32Int VR128X:$src1, VR128X:$src2))], itins_s.rr>, XS, EVEX_4V; let mayLoad = 1 in { @@ -3843,7 +4411,7 @@ multiclass avx512_sqrt_scalar opc, string OpcodeStr, (ins VR128X:$src1, ssmem:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, + [(set VR128X:$dst, (F32Int VR128X:$src1, sse_load_f32:$src2))], itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>; } @@ -3857,7 +4425,7 @@ multiclass avx512_sqrt_scalar opc, string OpcodeStr, (ins VR128X:$src1, VR128X:$src2), !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, + [(set VR128X:$dst, (F64Int VR128X:$src1, VR128X:$src2))], itins_s.rr>, XD, EVEX_4V, VEX_W; let mayLoad = 1 in { @@ -3871,27 +4439,51 @@ multiclass avx512_sqrt_scalar opc, string OpcodeStr, (ins VR128X:$src1, sdmem:$src2), !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F64Int VR128X:$src1, sse_load_f64:$src2))]>, + [(set VR128X:$dst, + (F64Int VR128X:$src1, sse_load_f64:$src2))]>, XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; } } +multiclass avx512_sqrt_packed_all opc, string OpcodeStr, + SDNode OpNode> { + defm PSZ : 
avx512_sqrt_packed, + EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_sqrt_packed, + EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_sqrt_packed, + EVEX_V128, PS, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_sqrt_packed, + EVEX_V256, PS, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_sqrt_packed, + EVEX_V128, VEX_W, PD, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_sqrt_packed, + EVEX_V256, VEX_W, PD, EVEX_CD8<64, CD8VF>; + } +} -defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt", - int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, - SSE_SQRTSS, SSE_SQRTSD>, - avx512_sqrt_packed<0x51, "vsqrt", fsqrt, - SSE_SQRTPS, SSE_SQRTPD>; +defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>; + +defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt", + int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, + SSE_SQRTSS, SSE_SQRTSD>; let Predicates = [HasAVX512] in { def : Pat<(v16f32 (int_x86_avx512_sqrt_ps_512 (v16f32 VR512:$src1), (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_CURRENT)), - (VSQRTPSZrr VR512:$src1)>; + (VSQRTPSZr VR512:$src1)>; def : Pat<(v8f64 (int_x86_avx512_sqrt_pd_512 (v8f64 VR512:$src1), (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_CURRENT)), - (VSQRTPDZrr VR512:$src1)>; - + (VSQRTPDZr VR512:$src1)>; + def : Pat<(f32 (fsqrt FR32X:$src)), (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; def : Pat<(f32 (fsqrt (load addr:$src))), @@ -4000,7 +4592,7 @@ let ExeDomain = GenericDomain in { (ins VR128X:$src1, ssmem:$src2, i32i8imm:$src3), !strconcat(OpcodeStr, "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128X:$dst, (F32Int VR128X:$src1, + [(set VR128X:$dst, (F32Int VR128X:$src1, sse_load_f32:$src2, imm:$src3))]>, EVEX_CD8<32, CD8VT1>; @@ -4041,14 +4633,14 @@ let ExeDomain = d in { def r : AVX512AIi8, EVEX; // Vector intrinsic operation, mem def m : AVX512AIi8, EVEX; } // ExeDomain } @@ -4079,20 +4671,20 @@ let ExeDomain = d in { def r : 
AVX512AIi8, EVEX_4V; def m : AVX512AIi8, EVEX_4V; } // ExeDomain } defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", ssmem, FR32X, SSEPackedSingle>, EVEX_CD8<32, CD8VT1>; - + defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", sdmem, FR64X, SSEPackedDouble>, EVEX_CD8<64, CD8VT1>; @@ -4148,32 +4740,32 @@ multiclass avx512_trunc_sat opc, string OpcodeStr, RegisterClass KRC, X86MemOperand x86memop> { def rr : AVX512XS8I, EVEX; def rrk : AVX512XS8I, EVEX, EVEX_K; def rrkz : AVX512XS8I, EVEX, EVEX_KZ; def mr : AVX512XS8I, EVEX; def mrk : AVX512XS8I, EVEX, EVEX_K; } -defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM, +defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM, i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; defm VPMOVSQB : avx512_trunc_sat<0x22, "vpmovsqb", VR128X, VR512, VK8WM, i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; @@ -4227,36 +4819,36 @@ multiclass avx512_extend opc, string OpcodeStr, RegisterClass KRC, def rr : AVX5128I, EVEX; def rrk : AVX5128I, EVEX, EVEX_K; def rrkz : AVX5128I, EVEX, EVEX_KZ; let mayLoad = 1 in { def rm : AVX5128I, EVEX; def rmk : AVX5128I, EVEX, EVEX_K; def rmkz : AVX5128I, EVEX, EVEX_KZ; } @@ -4304,7 +4896,7 @@ let mayLoad = 1, def rm : AVX5128I, EVEX, EVEX_K; } @@ -4321,7 +4913,7 @@ defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", VK16WM, VR512, vz32mem>, defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", VK8WM, VR256X, vz64mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; } - + defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", VK8WM, VR512, vy64xmem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", VK16WM, VR512, vz32mem>, @@ -4338,7 +4930,7 @@ let mayStore = 1, Constraints = "$mask = $mask_wb" in def mr : AVX5128I, EVEX, EVEX_K; } @@ -4371,7 +4963,7 @@ multiclass avx512_gather_scatter_prefetch opc, Format F, string OpcodeSt RegisterClass KRC, X86MemOperand memop> { let Predicates = [HasPFI], hasSideEffects = 1 in def m : AVX5128I, 
EVEX, EVEX_K; } @@ -4386,7 +4978,7 @@ defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd", defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd", VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; - + defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps", VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; @@ -4431,14 +5023,14 @@ multiclass avx512_shufp, EVEX_4V, Sched<[WriteShuffleLd, ReadAfterLd]>; def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, (i8 imm:$src3))))], d, IIC_SSE_SHUFP>, EVEX_4V, Sched<[WriteShuffle]>; @@ -4461,35 +5053,29 @@ def : Pat<(v8i64 (X86Shufp VR512:$src1, (memopv8i64 addr:$src2), (i8 imm:$imm))), (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>; -multiclass avx512_alignr { - def rri : AVX512AIi8<0x03, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$src3), - !strconcat(OpcodeStr, - " \t{$src3, $src2, $src1, $dst|" - "$dst, $src1, $src2, $src3}"), - []>, EVEX_4V; +multiclass avx512_valign { + defm rri : AVX512_maskable<0x03, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i8imm:$src3), + "valign"##_.Suffix, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT (X86VAlign _.RC:$src2, _.RC:$src1, + (i8 imm:$src3)))>, + AVX512AIi8Base, EVEX_4V; + + // Also match valign of packed floats. 
+ def : Pat<(_.FloatVT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 imm:$imm))), + (!cast(NAME##rri) _.RC:$src2, _.RC:$src1, imm:$imm)>; + let mayLoad = 1 in - def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, i8imm:$src3), - !strconcat(OpcodeStr, - " \t{$src3, $src2, $src1, $dst|" + def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, i8imm:$src3), + !strconcat("valign"##_.Suffix, + "\t{$src3, $src2, $src1, $dst|" "$dst, $src1, $src2, $src3}"), []>, EVEX_4V; } -defm VALIGND : avx512_alignr<"valignd", VR512, i512mem>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VALIGNQ : avx512_alignr<"valignq", VR512, i512mem>, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; - -def : Pat<(v16f32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>; -def : Pat<(v8f64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>; -def : Pat<(v16i32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>; -def : Pat<(v8i64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>; +defm VALIGND : avx512_valign, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VALIGNQ : avx512_valign, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; // Helper fragments to match sext vXi1 to vXiY. 
def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>; @@ -4500,43 +5086,43 @@ multiclass avx512_vpabs opc, string OpcodeStr, ValueType OpVT, X86MemOperand x86memop, X86MemOperand x86scalar_mop, string BrdcstStr> { def rr : AVX5128I, EVEX; def rrk : AVX5128I, EVEX, EVEX_K; def rrkz : AVX5128I, EVEX, EVEX_KZ; let mayLoad = 1 in { def rm : AVX5128I, EVEX; def rmk : AVX5128I, EVEX, EVEX_K; def rmkz : AVX5128I, EVEX, EVEX_KZ; def rmb : AVX5128I, EVEX, EVEX_B; def rmbk : AVX5128I, EVEX, EVEX_B, EVEX_K; def rmbkz : AVX5128I, EVEX, EVEX_B, EVEX_KZ; @@ -4566,54 +5152,54 @@ def : Pat<(v8i64 (int_x86_avx512_mask_pabs_q_512 (v8i64 VR512:$src), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))), (VPABSQZrr VR512:$src)>; -multiclass avx512_conflict opc, string OpcodeStr, +multiclass avx512_conflict opc, string OpcodeStr, RegisterClass RC, RegisterClass KRC, X86MemOperand x86memop, X86MemOperand x86scalar_mop, string BrdcstStr> { def rr : AVX5128I, EVEX; def rm : AVX5128I, EVEX; def rmb : AVX5128I, EVEX, EVEX_B; def rrkz : AVX5128I, EVEX, EVEX_KZ; def rmkz : AVX5128I, EVEX, EVEX_KZ; def rmbkz : AVX5128I, EVEX, EVEX_KZ, EVEX_B; - + let Constraints = "$src1 = $dst" in { def rrk : AVX5128I, EVEX, EVEX_K; def rmk : AVX5128I, EVEX, EVEX_K; def rmbk : AVX5128I, EVEX, EVEX_K, EVEX_B; } @@ -4687,3 +5273,135 @@ def truncstorei1 : PatFrag<(ops node:$val, node:$ptr), def : Pat<(truncstorei1 GR8:$src, addr:$dst), (MOV8mr addr:$dst, GR8:$src)>; +multiclass cvt_by_vec_width opc, X86VectorVTInfo Vec, string OpcodeStr > { +def rr : AVX512XS8I, EVEX; +} + +multiclass cvt_mask_by_elt_width opc, AVX512VLVectorVTInfo VTInfo, + string OpcodeStr, Predicate prd> { +let Predicates = [prd] in + defm Z : cvt_by_vec_width, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : cvt_by_vec_width, EVEX_V256; + defm Z128 : cvt_by_vec_width, EVEX_V128; + } +} + +multiclass avx512_convert_mask_to_vector { + defm NAME##B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, OpcodeStr, + HasBWI>; + 
defm NAME##W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, OpcodeStr, + HasBWI>, VEX_W; + defm NAME##D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, OpcodeStr, + HasDQI>; + defm NAME##Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, OpcodeStr, + HasDQI>, VEX_W; +} + +defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">; + +//===----------------------------------------------------------------------===// +// AVX-512 - COMPRESS and EXPAND +// +multiclass compress_by_vec_width opc, X86VectorVTInfo _, + string OpcodeStr> { + def rrkz : AVX5128I, EVEX_KZ; + + let Constraints = "$src0 = $dst" in + def rrk : AVX5128I, EVEX_K; + + let mayStore = 1 in { + def mrk : AVX5128I, + EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; + } +} + +multiclass compress_by_elt_width opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : compress_by_vec_width, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : compress_by_vec_width, EVEX_V256; + defm Z128 : compress_by_vec_width, EVEX_V128; + } +} + +defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", avx512vl_i32_info>, + EVEX; +defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", avx512vl_i64_info>, + EVEX, VEX_W; +defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", avx512vl_f32_info>, + EVEX; +defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info>, + EVEX, VEX_W; + +// expand +multiclass expand_by_vec_width opc, X86VectorVTInfo _, + string OpcodeStr> { + def rrkz : AVX5128I, EVEX_KZ; + + let Constraints = "$src0 = $dst" in + def rrk : AVX5128I, EVEX_K; + + let mayLoad = 1, Constraints = "$src0 = $dst" in + def rmk : AVX5128I, + EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; + + let mayLoad = 1 in + def rmkz : AVX5128I, + EVEX_KZ, EVEX_CD8<_.EltSize, CD8VT1>; + +} + +multiclass expand_by_elt_width opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : expand_by_vec_width, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : expand_by_vec_width, 
EVEX_V256; + defm Z128 : expand_by_vec_width, EVEX_V128; + } +} + +defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", avx512vl_i32_info>, + EVEX; +defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", avx512vl_i64_info>, + EVEX, VEX_W; +defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>, + EVEX; +defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>, + EVEX, VEX_W;