From 002683abc710f7fcaf921ac62fcb44e5b043362c Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky
Date: Wed, 5 Feb 2014 07:05:03 +0000
Subject: [PATCH] AVX-512: Added intrinsic for cvtph2ps. Added VPTESTNM
 instruction. Added a pattern to vselect (lit tests will follow).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@200823 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/IntrinsicsX86.td        | 12 ++--
 lib/Target/X86/X86ISelLowering.h        |  3 +-
 lib/Target/X86/X86InstrAVX512.td        | 82 ++++++++++++++++++-------
 lib/Target/X86/X86InstrFragmentsSIMD.td |  3 +
 test/CodeGen/X86/avx512-intrinsics.ll   | 13 ++--
 test/CodeGen/X86/avx512-shuffle.ll      | 14 ++---
 6 files changed, 85 insertions(+), 42 deletions(-)

diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 183317b23fa..7bbfcb9a963 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -2648,6 +2648,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_vcvtps2ph_256 : GCCBuiltin<"__builtin_ia32_vcvtps2ph256">,
         Intrinsic<[llvm_v8i16_ty], [llvm_v8f32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_vcvtph2ps_512 : GCCBuiltin<"__builtin_ia32_vcvtph2ps512_mask">,
+        Intrinsic<[llvm_v16f32_ty], [llvm_v16i16_ty, llvm_v16f32_ty,
+                   llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_vcvtps2ph_512 : GCCBuiltin<"__builtin_ia32_vcvtps2ph512_mask">,
+        Intrinsic<[llvm_v16i16_ty], [llvm_v16f32_ty, llvm_i32_ty,
+                   llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
 }

 //===----------------------------------------------------------------------===//
@@ -2756,12 +2762,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_avx512_cvtusi642sd : GCCBuiltin<"__builtin_ia32_cvtusi642sd">,
         Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i64_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_vcvtph2ps_512 : GCCBuiltin<"__builtin_ia32_vcvtph2ps512">,
-        Intrinsic<[llvm_v16f32_ty], [llvm_v16i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_vcvtps2ph_512 : GCCBuiltin<"__builtin_ia32_vcvtps2ph512">,
-        Intrinsic<[llvm_v16i16_ty], [llvm_v16f32_ty, llvm_i32_ty],
-                  [IntrNoMem]>;
 }

 // Vector convert
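The two intrinsics added above replace the unmasked llvm.x86.avx512.vcvtph2ps.512 / vcvtps2ph.512 variants removed below them: each now takes an explicit pass-through vector and an i16 write-mask, plus an i32 argument (a rounding-mode value for vcvtph2ps, the conversion-control immediate for vcvtps2ph). The LLVM IR sketch below only illustrates the new signatures; the function names are made up, and the i32 values (4, i.e. FROUND_CURRENT, and 2) are simply the ones used by the updated test later in this patch.

declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32)
declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16)

; Convert 16 half-precision values to single precision; lanes whose mask bit is
; clear are intended to keep the corresponding element of %passthru.
define <16 x float> @example_ph2ps(<16 x i16> %src, <16 x float> %passthru, i16 %mask) {
  %r = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %src,
               <16 x float> %passthru, i16 %mask, i32 4)
  ret <16 x float> %r
}

; Convert 16 floats to half precision; the i32 operand is the conversion-control
; immediate, followed by the pass-through vector and the write-mask.
define <16 x i16> @example_ps2ph(<16 x float> %src, <16 x i16> %passthru, i16 %mask) {
  %r = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %src,
               i32 2, <16 x i16> %passthru, i16 %mask)
  ret <16 x i16> %r
}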
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 5fb628f166b..90ac9b57753 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -312,8 +312,9 @@ namespace llvm {
       // TESTP - Vector packed fp sign bitwise comparisons.
       TESTP,

-      // TESTM - Vector "test" in AVX-512, the result is in a mask vector.
+      // TESTM, TESTNM - Vector "test" in AVX-512, the result is in a mask vector.
       TESTM,
+      TESTNM,

       // OR/AND test for masks
       KORTEST,
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index fefa5fc6bc3..2f9c0578cfe 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -613,13 +613,13 @@ defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, i512me
 defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, i512mem,
                   X86VPermiv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPERM2D : avx512_perm_3src<0x7E, "vperm2d", VR512, memopv16i32, i512mem,
+defm VPERMT2D : avx512_perm_3src<0x7E, "vpermt2d", VR512, memopv16i32, i512mem,
                   X86VPermv3, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPERM2Q : avx512_perm_3src<0x7E, "vperm2q", VR512, memopv8i64, i512mem,
+defm VPERMT2Q : avx512_perm_3src<0x7E, "vpermt2q", VR512, memopv8i64, i512mem,
                   X86VPermv3, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPERM2PS : avx512_perm_3src<0x7F, "vperm2ps", VR512, memopv16f32, i512mem,
+defm VPERMT2PS : avx512_perm_3src<0x7F, "vpermt2ps", VR512, memopv16f32, i512mem,
                   X86VPermv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPERM2PD : avx512_perm_3src<0x7F, "vperm2pd", VR512, memopv8f64, i512mem,
+defm VPERMT2PD : avx512_perm_3src<0x7F, "vpermt2pd", VR512, memopv8f64, i512mem,
                   X86VPermv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
 //===----------------------------------------------------------------------===//
 // AVX-512 - BLEND using mask
@@ -1332,6 +1332,11 @@ let Constraints = "$src1 = $dst" in {
                     " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
                     []>, EVEX, EVEX_K;
   }
+  def rrkz : AVX512XSI,
+             EVEX, EVEX_KZ;
 }

 defm VMOVDQU32 : avx512_mov_int<0x6F, 0x7F, "vmovdqu32", VR512, VK16WM,
@@ -1351,6 +1356,23 @@ def : Pat<(store (v16i32 VR512:$src), addr:$dst),
           (VMOVDQU32mr addr:$dst, VR512:$src)>;

 let AddedComplexity = 20 in {
+def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src),
+                          (bc_v8i64 (v16i32 immAllZerosV)))),
+                  (VMOVDQU64rrkz VK8WM:$mask, VR512:$src)>;
+
+def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
+                          (v8i64 VR512:$src))),
+   (VMOVDQU64rrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)),
+                                     VK8), VR512:$src)>;
+
+def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src),
+                           (v16i32 immAllZerosV))),
+                  (VMOVDQU32rrkz VK16WM:$mask, VR512:$src)>;
+
+def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV),
+                           (v16i32 VR512:$src))),
+                  (VMOVDQU32rrkz (KNOTWrr VK16WM:$mask), VR512:$src)>;
+
 def : Pat<(v16f32 (vselect VK16WM:$mask, (v16f32 VR512:$src1),
                            (v16f32 VR512:$src2))),
                  (VMOVUPSZrrk VR512:$src2, VK16WM:$mask, VR512:$src1)>;
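The four patterns just added recognize a vselect whose true or false operand is an all-zero vector and select it to the new zero-masking register moves (VMOVDQU32rrkz / VMOVDQU64rrkz), inverting the mask with KNOTW when the zero vector is the true operand. A hedged IR sketch of the kind of code they target follows; the function and value names are illustrative, and the commit message notes that the actual lit tests will land separately.

; A masked "keep or zero" copy. With the patterns above this is expected to
; select to a single zero-masking move such as vmovdqu32 %zmm, %zmm {%k} {z}
; instead of a blend against an explicitly materialized zero vector.
define <16 x i32> @zero_masked_copy(<16 x i32> %x, <16 x i32> %y) {
  %mask = icmp eq <16 x i32> %x, %y
  %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
  ret <16 x i32> %r
}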
@@ -2118,24 +2140,34 @@ def : Pat<(v8f64 (int_x86_avx512_mask_min_pd_512 (v8f64 VR512:$src1),
 multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC,
              RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag,
              SDNode OpNode, ValueType vt> {
-  def rr : AVX5128I, EVEX_4V;
-  def rm : AVX5128I, EVEX_4V;
+  def rm : AVX512PI, EVEX_4V;
+              (bitconvert (memop_frag addr:$src2))))], SSEPackedInt>, EVEX_4V;
 }

 defm VPTESTMDZ  : avx512_vptest<0x27, "vptestmd", VK16, VR512, f512mem,
-              memopv16i32, X86testm, v16i32>, EVEX_V512,
+              memopv16i32, X86testm, v16i32>, T8PD, EVEX_V512,
               EVEX_CD8<32, CD8VF>;
 defm VPTESTMQZ  : avx512_vptest<0x27, "vptestmq", VK8, VR512, f512mem,
-              memopv8i64, X86testm, v8i64>, EVEX_V512, VEX_W,
+              memopv8i64, X86testm, v8i64>, T8XS, EVEX_V512, VEX_W,
               EVEX_CD8<64, CD8VF>;

+let Predicates = [HasCDI] in {
+defm VPTESTNMDZ  : avx512_vptest<0x27, "vptestnmd", VK16, VR512, f512mem,
+              memopv16i32, X86testnm, v16i32>, T8XS, EVEX_V512,
+              EVEX_CD8<32, CD8VF>;
+defm VPTESTNMQZ  : avx512_vptest<0x27, "vptestnmq", VK8, VR512, f512mem,
+              memopv8i64, X86testnm, v8i64>, T8PD, EVEX_V512, VEX_W,
+              EVEX_CD8<64, CD8VF>;
+}
+
 def : Pat <(i16 (int_x86_avx512_mask_ptestm_d_512 (v16i32 VR512:$src1),
                  (v16i32 VR512:$src2), (i16 -1))),
            (COPY_TO_REGCLASS (VPTESTMDZrr VR512:$src1, VR512:$src2), GR16)>;
@@ -2997,35 +3029,41 @@ let Predicates = [HasAVX512] in {
 //===----------------------------------------------------------------------===//
 // Half precision conversion instructions
 //===----------------------------------------------------------------------===//
-multiclass avx512_f16c_ph2ps {
+multiclass avx512_cvtph2ps {
   def rr : AVX5128I<0x13, MRMSrcReg, (outs destRC:$dst), (ins srcRC:$src),
              "vcvtph2ps\t{$src, $dst|$dst, $src}",
-             [(set destRC:$dst, (Int srcRC:$src))]>, EVEX;
+             []>, EVEX;
   let hasSideEffects = 0, mayLoad = 1 in
   def rm : AVX5128I<0x13, MRMSrcMem, (outs destRC:$dst), (ins x86memop:$src),
              "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, EVEX;
 }

-multiclass avx512_f16c_ps2ph {
+multiclass avx512_cvtps2ph {
   def rr : AVX512AIi8<0x1D, MRMDestReg, (outs destRC:$dst),
              (ins srcRC:$src1, i32i8imm:$src2),
-             "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-             [(set destRC:$dst, (Int srcRC:$src1, imm:$src2))]>, EVEX;
+             "vcvtps2ph \t{$src2, $src1, $dst|$dst, $src1, $src2}",
+             []>, EVEX;
   let hasSideEffects = 0, mayStore = 1 in
   def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
              (ins x86memop:$dst, srcRC:$src1, i32i8imm:$src2),
-             "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX;
+             "vcvtps2ph \t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX;
 }

-defm VCVTPH2PSZ : avx512_f16c_ph2ps, EVEX_V512,
+defm VCVTPH2PSZ : avx512_cvtph2ps, EVEX_V512,
                   EVEX_CD8<32, CD8VH>;
-defm VCVTPS2PHZ : avx512_f16c_ps2ph, EVEX_V512,
+defm VCVTPS2PHZ : avx512_cvtps2ph, EVEX_V512,
                   EVEX_CD8<32, CD8VH>;

+def : Pat<(v16i16 (int_x86_avx512_mask_vcvtps2ph_512 (v16f32 VR512:$src),
+           imm:$rc, (bc_v16i16(v8i32 immAllZerosV)), (i16 -1))),
+           (VCVTPS2PHZrr VR512:$src, imm:$rc)>;
+
+def : Pat<(v16f32 (int_x86_avx512_mask_vcvtph2ps_512 (v16i16 VR256X:$src),
+           (bc_v16f32(v16i32 immAllZerosV)), (i16 -1), (i32 FROUND_CURRENT))),
+           (VCVTPH2PSZrr VR256X:$src)>;
+
 let Defs = [EFLAGS], Predicates = [HasAVX512] in {
   defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32,
                                  "ucomiss">, TB, EVEX, VEX_LIG,
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 3ab1ea7134f..551b446c81f 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -177,6 +177,9 @@ def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
 def X86testm  : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>,
                                                               SDTCisVec<1>,
                                                               SDTCisSameAs<2, 1>]>>;
+def X86testnm : SDNode<"X86ISD::TESTNM", SDTypeProfile<1, 2, [SDTCisVec<0>,
+                                                              SDTCisVec<1>,
+                                                              SDTCisSameAs<2, 1>]>>;
 def X86select  : SDNode<"X86ISD::SELECT" , SDTSelect>;

 def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
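The X86testnm node added above mirrors X86testm with the opposite polarity: vptestnm sets a mask bit when the AND of the corresponding source elements is zero, while vptestm sets it when the AND is non-zero. The IR below is only a sketch of the comparison shape such a node could cover; this patch adds the node and the instruction definitions, and whether a given icmp is actually combined into TESTNM depends on lowering code not shown here.

; Mask of lanes where %x and %y share no set bits, i.e. the
; "(and x, y) == 0" comparison that vptestnmd computes directly.
define <16 x i1> @testnm_shape(<16 x i32> %x, <16 x i32> %y) {
  %a = and <16 x i32> %x, %y
  %m = icmp eq <16 x i32> %a, zeroinitializer
  ret <16 x i1> %m
}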
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 8d569d66c39..b9bc2b47c63 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -220,19 +220,20 @@ define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
 declare i64 @llvm.x86.avx512.cvtsd2usi64(<2 x double>) nounwind readnone

 define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) {
-  ; CHECK: vcvtph2ps
-  %res = call <16 x float> @llvm.x86.avx512.vcvtph2ps.512(<16 x i16> %a0)
+  ; CHECK: vcvtph2ps %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x13,0xc0]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
   ret <16 x float> %res
 }
-declare <16 x float> @llvm.x86.avx512.vcvtph2ps.512(<16 x i16>) nounwind readonly
+declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly

 define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0) {
-  ; CHECK: vcvtps2ph
-  %res = call <16 x i16> @llvm.x86.avx512.vcvtps2ph.512(<16 x float> %a0, i32 0)
+  ; CHECK: vcvtps2ph $2, %zmm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1d,0xc0,0x02]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
   ret <16 x i16> %res
 }
-declare <16 x i16> @llvm.x86.avx512.vcvtps2ph.512(<16 x float>, i32) nounwind readonly
+
+declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly

 define <16 x float> @test_x86_vbroadcast_ss_512(i8* %a0) {
   ; CHECK: vbroadcastss
diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
index 7356c1c05e3..59d7010c8d1 100644
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ b/test/CodeGen/X86/avx512-shuffle.ll
@@ -49,7 +49,7 @@ define <8 x double> @test4(<8 x double> %a) nounwind {
 }

 ; CHECK-LABEL: test5:
-; CHECK: vperm2pd
+; CHECK: vpermt2pd
 ; CHECK: ret
 define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind {
   %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32>
@@ -65,7 +65,7 @@ define <8 x i64> @test6(<8 x i64> %a) nounwind {
 }

 ; CHECK-LABEL: test7:
-; CHECK: vperm2q
+; CHECK: vpermt2q
 ; CHECK: ret
 define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
   %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32>
@@ -73,7 +73,7 @@ define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
 }

 ; CHECK-LABEL: test8:
-; CHECK: vperm2d
+; CHECK: vpermt2d
 ; CHECK: ret
 define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
   %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32>
@@ -81,7 +81,7 @@ define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
 }

 ; CHECK-LABEL: test9:
-; CHECK: vperm2ps
+; CHECK: vpermt2ps
 ; CHECK: ret
 define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
   %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32>
@@ -89,7 +89,7 @@ define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
 }

 ; CHECK-LABEL: test10:
-; CHECK: vperm2ps (
+; CHECK: vpermt2ps (
 ; CHECK: ret
 define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
   %c = load <16 x float>* %b
@@ -98,7 +98,7 @@ define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
 }

 ; CHECK-LABEL: test11:
-; CHECK: vperm2d
+; CHECK: vpermt2d
 ; CHECK: ret
 define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
   %c = load <16 x i32>* %b
@@ -202,7 +202,7 @@ define <16 x float> @test23(<16 x float> %a, <16 x float> %c) {
 }

 ; CHECK-LABEL: @test24
-; CHECK: vperm2d
+; CHECK: vpermt2d
 ; CHECK: ret
 define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind {
   %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32>
-- 
2.34.1
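The commit message promises lit tests for the new vselect patterns in a follow-up. Below is a sketch of what such a test might look like, in the style of the existing AVX-512 CodeGen tests; the RUN line, function name, and CHECK expectations are illustrative guesses, not part of this commit, and the exact instruction sequence and register assignment may differ.

; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=knl | FileCheck %s

; CHECK-LABEL: select_zero_v16i32
; CHECK: vpcmpeqd
; CHECK: vmovdqu32 {{.*}} {%k{{[0-9]}}} {z}
; CHECK: ret
define <16 x i32> @select_zero_v16i32(<16 x i32> %x, <16 x i32> %y) {
  %mask = icmp eq <16 x i32> %x, %y
  %r = select <16 x i1> %mask, <16 x i32> %y, <16 x i32> zeroinitializer
  ret <16 x i32> %r
}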