1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=FMA4
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -fp-contract=fast | FileCheck %s --check-prefix=AVX512
8 ; Pattern: (fadd (fmul x, y), z) -> (fmadd x,y,z)
define <16 x float> @test_16f32_fmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; FMA-LABEL: test_16f32_fmadd:
; FMA:       # %bb.0:
; FMA-NEXT:    vfmadd213ps %ymm4, %ymm2, %ymm0
; FMA-NEXT:    vfmadd213ps %ymm5, %ymm3, %ymm1
; FMA-NEXT:    retq
;
; FMA4-LABEL: test_16f32_fmadd:
; FMA4:       # %bb.0:
; FMA4-NEXT:    vfmaddps %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vfmaddps %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT:    retq
;
; AVX512-LABEL: test_16f32_fmadd:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vfmadd213ps %zmm2, %zmm1, %zmm0
; AVX512-NEXT:    retq
  ; x*y + z contracts to a single fused multiply-add under -fp-contract=fast
  %x = fmul <16 x float> %a0, %a1
  %res = fadd <16 x float> %x, %a2
  ret <16 x float> %res
}
define <8 x double> @test_8f64_fmadd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; FMA-LABEL: test_8f64_fmadd:
; FMA:       # %bb.0:
; FMA-NEXT:    vfmadd213pd %ymm4, %ymm2, %ymm0
; FMA-NEXT:    vfmadd213pd %ymm5, %ymm3, %ymm1
; FMA-NEXT:    retq
;
; FMA4-LABEL: test_8f64_fmadd:
; FMA4:       # %bb.0:
; FMA4-NEXT:    vfmaddpd %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vfmaddpd %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT:    retq
;
; AVX512-LABEL: test_8f64_fmadd:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vfmadd213pd %zmm2, %zmm1, %zmm0
; AVX512-NEXT:    retq
  ; x*y + z contracts to a single fused multiply-add under -fp-contract=fast
  %x = fmul <8 x double> %a0, %a1
  %res = fadd <8 x double> %x, %a2
  ret <8 x double> %res
}
56 ; Pattern: (fsub (fmul x, y), z) -> (fmsub x, y, z)
define <16 x float> @test_16f32_fmsub(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; FMA-LABEL: test_16f32_fmsub:
; FMA:       # %bb.0:
; FMA-NEXT:    vfmsub213ps %ymm4, %ymm2, %ymm0
; FMA-NEXT:    vfmsub213ps %ymm5, %ymm3, %ymm1
; FMA-NEXT:    retq
;
; FMA4-LABEL: test_16f32_fmsub:
; FMA4:       # %bb.0:
; FMA4-NEXT:    vfmsubps %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vfmsubps %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT:    retq
;
; AVX512-LABEL: test_16f32_fmsub:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vfmsub213ps %zmm2, %zmm1, %zmm0
; AVX512-NEXT:    retq
  ; x*y - z contracts to a single fused multiply-subtract under -fp-contract=fast
  %x = fmul <16 x float> %a0, %a1
  %res = fsub <16 x float> %x, %a2
  ret <16 x float> %res
}
define <8 x double> @test_8f64_fmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; FMA-LABEL: test_8f64_fmsub:
; FMA:       # %bb.0:
; FMA-NEXT:    vfmsub213pd %ymm4, %ymm2, %ymm0
; FMA-NEXT:    vfmsub213pd %ymm5, %ymm3, %ymm1
; FMA-NEXT:    retq
;
; FMA4-LABEL: test_8f64_fmsub:
; FMA4:       # %bb.0:
; FMA4-NEXT:    vfmsubpd %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vfmsubpd %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT:    retq
;
; AVX512-LABEL: test_8f64_fmsub:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vfmsub213pd %zmm2, %zmm1, %zmm0
; AVX512-NEXT:    retq
  ; x*y - z contracts to a single fused multiply-subtract under -fp-contract=fast
  %x = fmul <8 x double> %a0, %a1
  %res = fsub <8 x double> %x, %a2
  ret <8 x double> %res
}
104 ; Pattern: (fsub z, (fmul x, y)) -> (fnmadd x, y, z)
define <16 x float> @test_16f32_fnmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; FMA-LABEL: test_16f32_fnmadd:
; FMA:       # %bb.0:
; FMA-NEXT:    vfnmadd213ps %ymm4, %ymm2, %ymm0
; FMA-NEXT:    vfnmadd213ps %ymm5, %ymm3, %ymm1
; FMA-NEXT:    retq
;
; FMA4-LABEL: test_16f32_fnmadd:
; FMA4:       # %bb.0:
; FMA4-NEXT:    vfnmaddps %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vfnmaddps %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT:    retq
;
; AVX512-LABEL: test_16f32_fnmadd:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vfnmadd213ps %zmm2, %zmm1, %zmm0
; AVX512-NEXT:    retq
  ; z - x*y contracts to a negated fused multiply-add under -fp-contract=fast
  %x = fmul <16 x float> %a0, %a1
  %res = fsub <16 x float> %a2, %x
  ret <16 x float> %res
}
define <8 x double> @test_8f64_fnmadd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; FMA-LABEL: test_8f64_fnmadd:
; FMA:       # %bb.0:
; FMA-NEXT:    vfnmadd213pd %ymm4, %ymm2, %ymm0
; FMA-NEXT:    vfnmadd213pd %ymm5, %ymm3, %ymm1
; FMA-NEXT:    retq
;
; FMA4-LABEL: test_8f64_fnmadd:
; FMA4:       # %bb.0:
; FMA4-NEXT:    vfnmaddpd %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vfnmaddpd %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT:    retq
;
; AVX512-LABEL: test_8f64_fnmadd:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vfnmadd213pd %zmm2, %zmm1, %zmm0
; AVX512-NEXT:    retq
  ; z - x*y contracts to a negated fused multiply-add under -fp-contract=fast
  %x = fmul <8 x double> %a0, %a1
  %res = fsub <8 x double> %a2, %x
  ret <8 x double> %res
}
152 ; Pattern: (fsub (fneg (fmul x, y)), z) -> (fnmsub x, y, z)
define <16 x float> @test_16f32_fnmsub(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; FMA-LABEL: test_16f32_fnmsub:
; FMA:       # %bb.0:
; FMA-NEXT:    vfnmsub213ps %ymm4, %ymm2, %ymm0
; FMA-NEXT:    vfnmsub213ps %ymm5, %ymm3, %ymm1
; FMA-NEXT:    retq
;
; FMA4-LABEL: test_16f32_fnmsub:
; FMA4:       # %bb.0:
; FMA4-NEXT:    vfnmsubps %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vfnmsubps %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT:    retq
;
; AVX512-LABEL: test_16f32_fnmsub:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vfnmsub213ps %zmm2, %zmm1, %zmm0
; AVX512-NEXT:    retq
  ; -(x*y) - z: the fneg is spelled as (-0.0 - x*y) so it folds to fnmsub
  %x = fmul <16 x float> %a0, %a1
  %y = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
  %res = fsub <16 x float> %y, %a2
  ret <16 x float> %res
}
define <8 x double> @test_8f64_fnmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; FMA-LABEL: test_8f64_fnmsub:
; FMA:       # %bb.0:
; FMA-NEXT:    vfnmsub213pd %ymm4, %ymm2, %ymm0
; FMA-NEXT:    vfnmsub213pd %ymm5, %ymm3, %ymm1
; FMA-NEXT:    retq
;
; FMA4-LABEL: test_8f64_fnmsub:
; FMA4:       # %bb.0:
; FMA4-NEXT:    vfnmsubpd %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vfnmsubpd %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT:    retq
;
; AVX512-LABEL: test_8f64_fnmsub:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vfnmsub213pd %zmm2, %zmm1, %zmm0
; AVX512-NEXT:    retq
  ; -(x*y) - z: the fneg is spelled as (-0.0 - x*y) so it folds to fnmsub
  %x = fmul <8 x double> %a0, %a1
  %y = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x
  %res = fsub <8 x double> %y, %a2
  ret <8 x double> %res
}
202 ; Load Folding Patterns
define <16 x float> @test_16f32_fmadd_load(<16 x float>* %a0, <16 x float> %a1, <16 x float> %a2) {
; FMA-LABEL: test_16f32_fmadd_load:
; FMA:       # %bb.0:
; FMA-NEXT:    vfmadd132ps (%rdi), %ymm2, %ymm0
; FMA-NEXT:    vfmadd132ps 32(%rdi), %ymm3, %ymm1
; FMA-NEXT:    retq
;
; FMA4-LABEL: test_16f32_fmadd_load:
; FMA4:       # %bb.0:
; FMA4-NEXT:    vfmaddps %ymm2, (%rdi), %ymm0, %ymm0
; FMA4-NEXT:    vfmaddps %ymm3, 32(%rdi), %ymm1, %ymm1
; FMA4-NEXT:    retq
;
; AVX512-LABEL: test_16f32_fmadd_load:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps (%rdi), %zmm2
; AVX512-NEXT:    vfmadd213ps %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vmovaps %zmm2, %zmm0
; AVX512-NEXT:    retq
  ; One multiplicand comes from memory: the load should fold into the FMA
  %x = load <16 x float>, <16 x float>* %a0
  %y = fmul <16 x float> %x, %a1
  %res = fadd <16 x float> %y, %a2
  ret <16 x float> %res
}
define <8 x double> @test_8f64_fmsub_load(<8 x double>* %a0, <8 x double> %a1, <8 x double> %a2) {
; FMA-LABEL: test_8f64_fmsub_load:
; FMA:       # %bb.0:
; FMA-NEXT:    vfmsub132pd (%rdi), %ymm2, %ymm0
; FMA-NEXT:    vfmsub132pd 32(%rdi), %ymm3, %ymm1
; FMA-NEXT:    retq
;
; FMA4-LABEL: test_8f64_fmsub_load:
; FMA4:       # %bb.0:
; FMA4-NEXT:    vfmsubpd %ymm2, (%rdi), %ymm0, %ymm0
; FMA4-NEXT:    vfmsubpd %ymm3, 32(%rdi), %ymm1, %ymm1
; FMA4-NEXT:    retq
;
; AVX512-LABEL: test_8f64_fmsub_load:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovapd (%rdi), %zmm2
; AVX512-NEXT:    vfmsub213pd %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vmovaps %zmm2, %zmm0
; AVX512-NEXT:    retq
  ; One multiplicand comes from memory: the load should fold into the FMA
  %x = load <8 x double>, <8 x double>* %a0
  %y = fmul <8 x double> %x, %a1
  %res = fsub <8 x double> %y, %a2
  ret <8 x double> %res
}