From 28e12e9f02cf1c5029994b4a6d7e2988512e1310 Mon Sep 17 00:00:00 2001 From: Cameron McInally Date: Fri, 15 Nov 2013 17:01:14 +0000 Subject: [PATCH] Add AVX512 unmasked FMA intrinsics and support. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@194824 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsX86.td | 48 +++++++++++ lib/Target/X86/X86ISelLowering.cpp | 26 +++++- test/CodeGen/X86/avx512-fma-intrinsics.ll | 97 +++++++++++++++++++++++ 3 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 test/CodeGen/X86/avx512-fma-intrinsics.ll diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 1fe1c91d9f8..4c5718f8c8b 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -1864,6 +1864,14 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>; + def int_x86_fma_vfmadd_ps_512 : GCCBuiltin<"__builtin_ia32_vfmaddps512">, + Intrinsic<[llvm_v16f32_ty], + [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty], + [IntrNoMem]>; + def int_x86_fma_vfmadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfmaddpd512">, + Intrinsic<[llvm_v8f64_ty], + [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty], + [IntrNoMem]>; def int_x86_fma_vfmsub_ss : GCCBuiltin<"__builtin_ia32_vfmsubss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], @@ -1888,6 +1896,14 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>; + def int_x86_fma_vfmsub_ps_512 : GCCBuiltin<"__builtin_ia32_vfmsubps512">, + Intrinsic<[llvm_v16f32_ty], + [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty], + [IntrNoMem]>; + def int_x86_fma_vfmsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfmsubpd512">, + Intrinsic<[llvm_v8f64_ty], + [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty], + [IntrNoMem]>; def int_x86_fma_vfnmadd_ss : GCCBuiltin<"__builtin_ia32_vfnmaddss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], @@ -1912,6 +1928,14 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>; + def int_x86_fma_vfnmadd_ps_512 : GCCBuiltin<"__builtin_ia32_vfnmaddps512">, + Intrinsic<[llvm_v16f32_ty], + [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty], + [IntrNoMem]>; + def int_x86_fma_vfnmadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfnmaddpd512">, + Intrinsic<[llvm_v8f64_ty], + [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty], + [IntrNoMem]>; def int_x86_fma_vfnmsub_ss : GCCBuiltin<"__builtin_ia32_vfnmsubss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], @@ -1936,6 +1960,14 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>; + def int_x86_fma_vfnmsub_ps_512 : GCCBuiltin<"__builtin_ia32_vfnmsubps512">, + Intrinsic<[llvm_v16f32_ty], + [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty], + [IntrNoMem]>; + def int_x86_fma_vfnmsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfnmsubpd512">, + Intrinsic<[llvm_v8f64_ty], + [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty], + [IntrNoMem]>; def int_x86_fma_vfmaddsub_ps : GCCBuiltin<"__builtin_ia32_vfmaddsubps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], @@ -1954,6 +1986,14 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>; + def int_x86_fma_vfmaddsub_ps_512 : GCCBuiltin<"__builtin_ia32_vfmaddsubps512">, + Intrinsic<[llvm_v16f32_ty], + [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty], + [IntrNoMem]>; + def int_x86_fma_vfmaddsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfmaddsubpd512">, + Intrinsic<[llvm_v8f64_ty], + [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty], + [IntrNoMem]>; def int_x86_fma_vfmsubadd_ps : GCCBuiltin<"__builtin_ia32_vfmsubaddps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], @@ -1972,6 +2012,14 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>; + def int_x86_fma_vfmsubadd_ps_512 : GCCBuiltin<"__builtin_ia32_vfmsubaddps512">, + Intrinsic<[llvm_v16f32_ty], + [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty], + [IntrNoMem]>; + def int_x86_fma_vfmsubadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfmsubaddpd512">, + Intrinsic<[llvm_v8f64_ty], + [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty], + [IntrNoMem]>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index aa9642056c8..9df0232a341 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -11648,7 +11648,19 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_fma_vfmaddsub_ps_256: case Intrinsic::x86_fma_vfmaddsub_pd_256: case Intrinsic::x86_fma_vfmsubadd_ps_256: - case Intrinsic::x86_fma_vfmsubadd_pd_256: { + case Intrinsic::x86_fma_vfmsubadd_pd_256: + case Intrinsic::x86_fma_vfmadd_ps_512: + case Intrinsic::x86_fma_vfmadd_pd_512: + case Intrinsic::x86_fma_vfmsub_ps_512: + case Intrinsic::x86_fma_vfmsub_pd_512: + case Intrinsic::x86_fma_vfnmadd_ps_512: + case Intrinsic::x86_fma_vfnmadd_pd_512: + case Intrinsic::x86_fma_vfnmsub_ps_512: + case Intrinsic::x86_fma_vfnmsub_pd_512: + case Intrinsic::x86_fma_vfmaddsub_ps_512: + case Intrinsic::x86_fma_vfmaddsub_pd_512: + case Intrinsic::x86_fma_vfmsubadd_ps_512: + case Intrinsic::x86_fma_vfmsubadd_pd_512: { unsigned Opc; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. @@ -11656,36 +11668,48 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_fma_vfmadd_pd: case Intrinsic::x86_fma_vfmadd_ps_256: case Intrinsic::x86_fma_vfmadd_pd_256: + case Intrinsic::x86_fma_vfmadd_ps_512: + case Intrinsic::x86_fma_vfmadd_pd_512: Opc = X86ISD::FMADD; break; case Intrinsic::x86_fma_vfmsub_ps: case Intrinsic::x86_fma_vfmsub_pd: case Intrinsic::x86_fma_vfmsub_ps_256: case Intrinsic::x86_fma_vfmsub_pd_256: + case Intrinsic::x86_fma_vfmsub_ps_512: + case Intrinsic::x86_fma_vfmsub_pd_512: Opc = X86ISD::FMSUB; break; case Intrinsic::x86_fma_vfnmadd_ps: case Intrinsic::x86_fma_vfnmadd_pd: case Intrinsic::x86_fma_vfnmadd_ps_256: case Intrinsic::x86_fma_vfnmadd_pd_256: + case Intrinsic::x86_fma_vfnmadd_ps_512: + case Intrinsic::x86_fma_vfnmadd_pd_512: Opc = X86ISD::FNMADD; break; case Intrinsic::x86_fma_vfnmsub_ps: case Intrinsic::x86_fma_vfnmsub_pd: case Intrinsic::x86_fma_vfnmsub_ps_256: case Intrinsic::x86_fma_vfnmsub_pd_256: + case Intrinsic::x86_fma_vfnmsub_ps_512: + case Intrinsic::x86_fma_vfnmsub_pd_512: Opc = X86ISD::FNMSUB; break; case Intrinsic::x86_fma_vfmaddsub_ps: case Intrinsic::x86_fma_vfmaddsub_pd: case Intrinsic::x86_fma_vfmaddsub_ps_256: case Intrinsic::x86_fma_vfmaddsub_pd_256: + case Intrinsic::x86_fma_vfmaddsub_ps_512: + case Intrinsic::x86_fma_vfmaddsub_pd_512: Opc = X86ISD::FMADDSUB; break; case Intrinsic::x86_fma_vfmsubadd_ps: case Intrinsic::x86_fma_vfmsubadd_pd: case Intrinsic::x86_fma_vfmsubadd_ps_256: case Intrinsic::x86_fma_vfmsubadd_pd_256: + case Intrinsic::x86_fma_vfmsubadd_ps_512: + case Intrinsic::x86_fma_vfmsubadd_pd_512: Opc = X86ISD::FMSUBADD; break; } diff --git a/test/CodeGen/X86/avx512-fma-intrinsics.ll b/test/CodeGen/X86/avx512-fma-intrinsics.ll new file mode 100644 index 00000000000..ce3d7590f39 --- /dev/null +++ b/test/CodeGen/X86/avx512-fma-intrinsics.ll @@ -0,0 +1,97 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s + +define <16 x float> @test_x86_vfmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { + ; CHECK-LABEL: test_x86_vfmadd_ps_z + ; CHECK: vfmadd213ps %zmm + %res = call <16 x float> @llvm.x86.fma.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.fma.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone + +define <8 x double> @test_x86_vfmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { + ; CHECK-LABEL: test_x86_vfmadd_pd_z + ; CHECK: vfmadd213pd %zmm + %res = call <8 x double> @llvm.x86.fma.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.fma.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone + +define <16 x float> @test_x86_vfmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { + ; CHECK-LABEL: test_x86_vfmsubps_z + ; CHECK: vfmsub213ps %zmm + %res = call <16 x float> @llvm.x86.fma.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.fma.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone + +define <8 x double> @test_x86_vfmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { + ; CHECK-LABEL: test_x86_vfmsubpd_z + ; CHECK: vfmsub213pd %zmm + %res = call <8 x double> @llvm.x86.fma.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.fma.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone + +define <16 x float> @test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { + ; CHECK-LABEL: test_x86_vfnmadd_ps_z + ; CHECK: vfnmadd213ps %zmm + %res = call <16 x float> @llvm.x86.fma.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.fma.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone + +define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { + ; CHECK-LABEL: test_x86_vfnmadd_pd_z + ; CHECK: vfnmadd213pd %zmm + %res = call <8 x double> @llvm.x86.fma.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.fma.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone + +define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { + ; CHECK-LABEL: test_x86_vfnmsubps_z + ; CHECK: vfnmsub213ps %zmm + %res = call <16 x float> @llvm.x86.fma.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.fma.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone + +define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { + ; CHECK-LABEL: test_x86_vfnmsubpd_z + ; CHECK: vfnmsub213pd %zmm + %res = call <8 x double> @llvm.x86.fma.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.fma.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone + +define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { + ; CHECK-LABEL: test_x86_vfmaddsubps_z + ; CHECK: vfmaddsub213ps %zmm + %res = call <16 x float> @llvm.x86.fma.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.fma.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone + +define <8 x double> @test_x86_vfmaddsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { + ; CHECK-LABEL: test_x86_vfmaddsubpd_z + ; CHECK: vfmaddsub213pd %zmm + %res = call <8 x double> @llvm.x86.fma.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.fma.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone + +define <16 x float> @test_x86_vfmsubaddps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { + ; CHECK-LABEL: test_x86_vfmsubaddps_z + ; CHECK: vfmsubadd213ps %zmm + %res = call <16 x float> @llvm.x86.fma.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.fma.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone + +define <8 x double> @test_x86_vfmsubaddpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { + ; CHECK-LABEL: test_x86_vfmsubaddpd_z + ; CHECK: vfmsubadd213pd %zmm + %res = call <8 x double> @llvm.x86.fma.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.fma.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone -- 2.34.1