1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA4
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA4
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=AVX512
8 ; Pattern: (fadd (fmul x, y), z) -> (fmadd x, y, z)
; Scalar float: (%a0 * %a1) + %a2. Every run configuration is expected to emit
; one fused multiply-add; the AVX-512 checks additionally expect a vmovaps copy.
11 define float @test_f32_fmadd(float %a0, float %a1, float %a2) {
12 ; FMA-LABEL: test_f32_fmadd:
14 ; FMA-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
17 ; FMA4-LABEL: test_f32_fmadd:
19 ; FMA4-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
22 ; AVX512-LABEL: test_f32_fmadd:
24 ; AVX512-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1
25 ; AVX512-NEXT: vmovaps %zmm1, %zmm0
27 %x = fmul float %a0, %a1
28 %res = fadd float %x, %a2
; <4 x float>: (%a0 * %a1) + %a2, expected to become one packed-single
; fused multiply-add on xmm registers in every run configuration.
32 define <4 x float> @test_4f32_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
33 ; FMA-LABEL: test_4f32_fmadd:
35 ; FMA-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
38 ; FMA4-LABEL: test_4f32_fmadd:
40 ; FMA4-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
43 ; AVX512-LABEL: test_4f32_fmadd:
45 ; AVX512-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
47 %x = fmul <4 x float> %a0, %a1
48 %res = fadd <4 x float> %x, %a2
; <8 x float>: (%a0 * %a1) + %a2, expected to become one packed-single
; fused multiply-add on ymm registers in every run configuration.
52 define <8 x float> @test_8f32_fmadd(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
53 ; FMA-LABEL: test_8f32_fmadd:
55 ; FMA-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
58 ; FMA4-LABEL: test_8f32_fmadd:
60 ; FMA4-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
63 ; AVX512-LABEL: test_8f32_fmadd:
65 ; AVX512-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
67 %x = fmul <8 x float> %a0, %a1
68 %res = fadd <8 x float> %x, %a2
; Scalar double: (%a0 * %a1) + %a2. Every run configuration is expected to emit
; one fused multiply-add; the AVX-512 checks additionally expect a vmovaps copy.
72 define double @test_f64_fmadd(double %a0, double %a1, double %a2) {
73 ; FMA-LABEL: test_f64_fmadd:
75 ; FMA-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0
78 ; FMA4-LABEL: test_f64_fmadd:
80 ; FMA4-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
83 ; AVX512-LABEL: test_f64_fmadd:
85 ; AVX512-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1
86 ; AVX512-NEXT: vmovaps %zmm1, %zmm0
88 %x = fmul double %a0, %a1
89 %res = fadd double %x, %a2
; <2 x double>: returns (%a0 * %a1) + %a2, expected to become one packed-double
; fused multiply-add on xmm registers in every run configuration.
93 define <2 x double> @test_2f64_fmadd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
94 ; FMA-LABEL: test_2f64_fmadd:
96 ; FMA-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
99 ; FMA4-LABEL: test_2f64_fmadd:
101 ; FMA4-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
104 ; AVX512-LABEL: test_2f64_fmadd:
106 ; AVX512-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
108 %x = fmul <2 x double> %a0, %a1
109 %res = fadd <2 x double> %x, %a2
110 ret <2 x double> %res
; <4 x double>: returns (%a0 * %a1) + %a2, expected to become one packed-double
; fused multiply-add on ymm registers in every run configuration.
113 define <4 x double> @test_4f64_fmadd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
114 ; FMA-LABEL: test_4f64_fmadd:
116 ; FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
119 ; FMA4-LABEL: test_4f64_fmadd:
121 ; FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
124 ; AVX512-LABEL: test_4f64_fmadd:
126 ; AVX512-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
128 %x = fmul <4 x double> %a0, %a1
129 %res = fadd <4 x double> %x, %a2
130 ret <4 x double> %res
134 ; Pattern: (fsub (fmul x, y), z) -> (fmsub x, y, z)
; Scalar float: (%a0 * %a1) - %a2. Every run configuration is expected to emit
; one fused multiply-subtract; the AVX-512 checks additionally expect a vmovaps copy.
137 define float @test_f32_fmsub(float %a0, float %a1, float %a2) {
138 ; FMA-LABEL: test_f32_fmsub:
140 ; FMA-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0
143 ; FMA4-LABEL: test_f32_fmsub:
145 ; FMA4-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
148 ; AVX512-LABEL: test_f32_fmsub:
150 ; AVX512-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1
151 ; AVX512-NEXT: vmovaps %zmm1, %zmm0
153 %x = fmul float %a0, %a1
154 %res = fsub float %x, %a2
; <4 x float>: (%a0 * %a1) - %a2, expected to become one packed-single
; fused multiply-subtract on xmm registers in every run configuration.
158 define <4 x float> @test_4f32_fmsub(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
159 ; FMA-LABEL: test_4f32_fmsub:
161 ; FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
164 ; FMA4-LABEL: test_4f32_fmsub:
166 ; FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
169 ; AVX512-LABEL: test_4f32_fmsub:
171 ; AVX512-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
173 %x = fmul <4 x float> %a0, %a1
174 %res = fsub <4 x float> %x, %a2
; <8 x float>: (%a0 * %a1) - %a2, expected to become one packed-single
; fused multiply-subtract on ymm registers in every run configuration.
178 define <8 x float> @test_8f32_fmsub(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
179 ; FMA-LABEL: test_8f32_fmsub:
181 ; FMA-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
184 ; FMA4-LABEL: test_8f32_fmsub:
186 ; FMA4-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
189 ; AVX512-LABEL: test_8f32_fmsub:
191 ; AVX512-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
193 %x = fmul <8 x float> %a0, %a1
194 %res = fsub <8 x float> %x, %a2
; Scalar double: (%a0 * %a1) - %a2. Every run configuration is expected to emit
; one fused multiply-subtract; the AVX-512 checks additionally expect a vmovaps copy.
198 define double @test_f64_fmsub(double %a0, double %a1, double %a2) {
199 ; FMA-LABEL: test_f64_fmsub:
201 ; FMA-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0
204 ; FMA4-LABEL: test_f64_fmsub:
206 ; FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
209 ; AVX512-LABEL: test_f64_fmsub:
211 ; AVX512-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1
212 ; AVX512-NEXT: vmovaps %zmm1, %zmm0
214 %x = fmul double %a0, %a1
215 %res = fsub double %x, %a2
; <2 x double>: returns (%a0 * %a1) - %a2, expected to become one packed-double
; fused multiply-subtract on xmm registers in every run configuration.
219 define <2 x double> @test_2f64_fmsub(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
220 ; FMA-LABEL: test_2f64_fmsub:
222 ; FMA-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0
225 ; FMA4-LABEL: test_2f64_fmsub:
227 ; FMA4-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
230 ; AVX512-LABEL: test_2f64_fmsub:
232 ; AVX512-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0
234 %x = fmul <2 x double> %a0, %a1
235 %res = fsub <2 x double> %x, %a2
236 ret <2 x double> %res
; <4 x double>: returns (%a0 * %a1) - %a2, expected to become one packed-double
; fused multiply-subtract on ymm registers in every run configuration.
239 define <4 x double> @test_4f64_fmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
240 ; FMA-LABEL: test_4f64_fmsub:
242 ; FMA-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0
245 ; FMA4-LABEL: test_4f64_fmsub:
247 ; FMA4-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
250 ; AVX512-LABEL: test_4f64_fmsub:
252 ; AVX512-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0
254 %x = fmul <4 x double> %a0, %a1
255 %res = fsub <4 x double> %x, %a2
256 ret <4 x double> %res
260 ; Pattern: (fsub z, (fmul x, y)) -> (fnmadd x, y, z)
; Scalar float: %a2 - (%a0 * %a1). Every run configuration is expected to emit
; one vfnmadd instruction; the AVX-512 checks additionally expect a vmovaps copy.
263 define float @test_f32_fnmadd(float %a0, float %a1, float %a2) {
264 ; FMA-LABEL: test_f32_fnmadd:
266 ; FMA-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0
269 ; FMA4-LABEL: test_f32_fnmadd:
271 ; FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
274 ; AVX512-LABEL: test_f32_fnmadd:
276 ; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1
277 ; AVX512-NEXT: vmovaps %zmm1, %zmm0
279 %x = fmul float %a0, %a1
280 %res = fsub float %a2, %x
; <4 x float>: %a2 - (%a0 * %a1), expected to become one packed-single
; vfnmadd instruction on xmm registers in every run configuration.
284 define <4 x float> @test_4f32_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
285 ; FMA-LABEL: test_4f32_fnmadd:
287 ; FMA-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
290 ; FMA4-LABEL: test_4f32_fnmadd:
292 ; FMA4-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
295 ; AVX512-LABEL: test_4f32_fnmadd:
297 ; AVX512-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
299 %x = fmul <4 x float> %a0, %a1
300 %res = fsub <4 x float> %a2, %x
; <8 x float>: %a2 - (%a0 * %a1), expected to become one packed-single
; vfnmadd instruction on ymm registers in every run configuration.
304 define <8 x float> @test_8f32_fnmadd(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
305 ; FMA-LABEL: test_8f32_fnmadd:
307 ; FMA-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
310 ; FMA4-LABEL: test_8f32_fnmadd:
312 ; FMA4-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
315 ; AVX512-LABEL: test_8f32_fnmadd:
317 ; AVX512-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
319 %x = fmul <8 x float> %a0, %a1
320 %res = fsub <8 x float> %a2, %x
; Scalar double: %a2 - (%a0 * %a1). Every run configuration is expected to emit
; one vfnmadd instruction; the AVX-512 checks additionally expect a vmovaps copy.
324 define double @test_f64_fnmadd(double %a0, double %a1, double %a2) {
325 ; FMA-LABEL: test_f64_fnmadd:
327 ; FMA-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0
330 ; FMA4-LABEL: test_f64_fnmadd:
332 ; FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
335 ; AVX512-LABEL: test_f64_fnmadd:
337 ; AVX512-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1
338 ; AVX512-NEXT: vmovaps %zmm1, %zmm0
340 %x = fmul double %a0, %a1
341 %res = fsub double %a2, %x
; <2 x double>: returns %a2 - (%a0 * %a1), expected to become one packed-double
; vfnmadd instruction on xmm registers in every run configuration.
345 define <2 x double> @test_2f64_fnmadd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
346 ; FMA-LABEL: test_2f64_fnmadd:
348 ; FMA-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0
351 ; FMA4-LABEL: test_2f64_fnmadd:
353 ; FMA4-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
356 ; AVX512-LABEL: test_2f64_fnmadd:
358 ; AVX512-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0
360 %x = fmul <2 x double> %a0, %a1
361 %res = fsub <2 x double> %a2, %x
362 ret <2 x double> %res
; <4 x double>: returns %a2 - (%a0 * %a1), expected to become one packed-double
; vfnmadd instruction on ymm registers in every run configuration.
365 define <4 x double> @test_4f64_fnmadd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
366 ; FMA-LABEL: test_4f64_fnmadd:
368 ; FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
371 ; FMA4-LABEL: test_4f64_fnmadd:
373 ; FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
376 ; AVX512-LABEL: test_4f64_fnmadd:
378 ; AVX512-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
380 %x = fmul <4 x double> %a0, %a1
381 %res = fsub <4 x double> %a2, %x
382 ret <4 x double> %res
386 ; Pattern: (fsub (fneg (fmul x, y)), z) -> (fnmsub x, y, z)
; Scalar float: (-(%a0 * %a1)) - %a2, with the negation written as a subtraction
; from -0.0. Every run configuration is expected to emit one vfnmsub instruction;
; the AVX-512 checks additionally expect a vmovaps copy.
389 define float @test_f32_fnmsub(float %a0, float %a1, float %a2) {
390 ; FMA-LABEL: test_f32_fnmsub:
392 ; FMA-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0
395 ; FMA4-LABEL: test_f32_fnmsub:
397 ; FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
400 ; AVX512-LABEL: test_f32_fnmsub:
402 ; AVX512-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1
403 ; AVX512-NEXT: vmovaps %zmm1, %zmm0
405 %x = fmul float %a0, %a1
406 %y = fsub float -0.000000e+00, %x
407 %res = fsub float %y, %a2
; <4 x float>: (-(%a0 * %a1)) - %a2, with the negation written as a subtraction
; from a -0.0 splat. Expected to become one packed-single vfnmsub on xmm registers.
411 define <4 x float> @test_4f32_fnmsub(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
412 ; FMA-LABEL: test_4f32_fnmsub:
414 ; FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
417 ; FMA4-LABEL: test_4f32_fnmsub:
419 ; FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
422 ; AVX512-LABEL: test_4f32_fnmsub:
424 ; AVX512-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
426 %x = fmul <4 x float> %a0, %a1
427 %y = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
428 %res = fsub <4 x float> %y, %a2
; <8 x float>: (-(%a0 * %a1)) - %a2, with the negation written as a subtraction
; from a -0.0 splat. Expected to become one packed-single vfnmsub on ymm registers.
432 define <8 x float> @test_8f32_fnmsub(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
433 ; FMA-LABEL: test_8f32_fnmsub:
435 ; FMA-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
438 ; FMA4-LABEL: test_8f32_fnmsub:
440 ; FMA4-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
443 ; AVX512-LABEL: test_8f32_fnmsub:
445 ; AVX512-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
447 %x = fmul <8 x float> %a0, %a1
448 %y = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
449 %res = fsub <8 x float> %y, %a2
; Scalar double: (-(%a0 * %a1)) - %a2, with the negation written as a subtraction
; from -0.0. Every run configuration is expected to emit one vfnmsub instruction;
; the AVX-512 checks additionally expect a vmovaps copy.
453 define double @test_f64_fnmsub(double %a0, double %a1, double %a2) {
454 ; FMA-LABEL: test_f64_fnmsub:
456 ; FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0
459 ; FMA4-LABEL: test_f64_fnmsub:
461 ; FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
464 ; AVX512-LABEL: test_f64_fnmsub:
466 ; AVX512-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1
467 ; AVX512-NEXT: vmovaps %zmm1, %zmm0
469 %x = fmul double %a0, %a1
470 %y = fsub double -0.000000e+00, %x
471 %res = fsub double %y, %a2
; <2 x double>: returns (-(%a0 * %a1)) - %a2, with the negation written as a
; subtraction from a -0.0 splat. Expected to become one packed-double vfnmsub
; on xmm registers.
475 define <2 x double> @test_2f64_fnmsub(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
476 ; FMA-LABEL: test_2f64_fnmsub:
478 ; FMA-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
481 ; FMA4-LABEL: test_2f64_fnmsub:
483 ; FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
486 ; AVX512-LABEL: test_2f64_fnmsub:
488 ; AVX512-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
490 %x = fmul <2 x double> %a0, %a1
491 %y = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x
492 %res = fsub <2 x double> %y, %a2
493 ret <2 x double> %res
; <4 x double>: returns (-(%a0 * %a1)) - %a2, with the negation written as a
; subtraction from a -0.0 splat. Expected to become one packed-double vfnmsub
; on ymm registers.
496 define <4 x double> @test_4f64_fnmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
497 ; FMA-LABEL: test_4f64_fnmsub:
499 ; FMA-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
502 ; FMA4-LABEL: test_4f64_fnmsub:
504 ; FMA4-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
507 ; AVX512-LABEL: test_4f64_fnmsub:
509 ; AVX512-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
511 %x = fmul <4 x double> %a0, %a1
512 %y = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x
513 %res = fsub <4 x double> %y, %a2
514 ret <4 x double> %res
518 ; Load Folding Patterns
; Loads a <4 x float> through pointer %a0, multiplies it by %a1 and adds %a2.
; The first three run configurations expect the (%rdi) memory operand folded
; directly into the fused instruction; the AVX-512 checks instead expect a
; separate vmovaps load, a register-only vfmadd, and a final vmovaps copy.
521 define <4 x float> @test_4f32_fmadd_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
522 ; FMA-LABEL: test_4f32_fmadd_load:
524 ; FMA-NEXT: vfmadd132ps (%rdi), %xmm1, %xmm0
527 ; FMA4-LABEL: test_4f32_fmadd_load:
529 ; FMA4-NEXT: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0
532 ; AVX512-LABEL: test_4f32_fmadd_load:
534 ; AVX512-NEXT: vmovaps (%rdi), %xmm2
535 ; AVX512-NEXT: vfmadd213ps %xmm1, %xmm0, %xmm2
536 ; AVX512-NEXT: vmovaps %zmm2, %zmm0
538 %x = load <4 x float>, <4 x float>* %a0
539 %y = fmul <4 x float> %x, %a1
540 %res = fadd <4 x float> %y, %a2
; Loads a <2 x double> through pointer %a0, multiplies it by %a1, subtracts %a2
; and returns the result. The first three run configurations expect the (%rdi)
; memory operand folded directly into the fused instruction; the AVX-512 checks
; instead expect a separate vmovapd load, a register-only vfmsub, and a final
; vmovaps copy.
544 define <2 x double> @test_2f64_fmsub_load(<2 x double>* %a0, <2 x double> %a1, <2 x double> %a2) {
545 ; FMA-LABEL: test_2f64_fmsub_load:
547 ; FMA-NEXT: vfmsub132pd (%rdi), %xmm1, %xmm0
550 ; FMA4-LABEL: test_2f64_fmsub_load:
552 ; FMA4-NEXT: vfmsubpd %xmm1, (%rdi), %xmm0, %xmm0
555 ; AVX512-LABEL: test_2f64_fmsub_load:
557 ; AVX512-NEXT: vmovapd (%rdi), %xmm2
558 ; AVX512-NEXT: vfmsub213pd %xmm1, %xmm0, %xmm2
559 ; AVX512-NEXT: vmovaps %zmm2, %zmm0
561 %x = load <2 x double>, <2 x double>* %a0
562 %y = fmul <2 x double> %x, %a1
563 %res = fsub <2 x double> %y, %a2
564 ret <2 x double> %res
568 ; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
; (%x + 1.0) * %y, with the sum as the left multiply operand. The checks expect
; a single vfmadd that uses %xmm1 twice and %xmm0 once, with no separate add.
571 define <4 x float> @test_v4f32_mul_add_x_one_y(<4 x float> %x, <4 x float> %y) {
572 ; FMA-LABEL: test_v4f32_mul_add_x_one_y:
574 ; FMA-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
577 ; FMA4-LABEL: test_v4f32_mul_add_x_one_y:
579 ; FMA4-NEXT: vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
582 ; AVX512-LABEL: test_v4f32_mul_add_x_one_y:
584 ; AVX512-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
586 %a = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
587 %m = fmul <4 x float> %a, %y
; %y * (%x + 1.0): the commuted form of the previous pattern, with the sum as
; the right multiply operand. The checks expect the same single vfmadd.
591 define <4 x float> @test_v4f32_mul_y_add_x_one(<4 x float> %x, <4 x float> %y) {
592 ; FMA-LABEL: test_v4f32_mul_y_add_x_one:
594 ; FMA-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
597 ; FMA4-LABEL: test_v4f32_mul_y_add_x_one:
599 ; FMA4-NEXT: vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
602 ; AVX512-LABEL: test_v4f32_mul_y_add_x_one:
604 ; AVX512-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
606 %a = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
607 %m = fmul <4 x float> %y, %a
; (%x + -1.0) * %y, with the sum as the left multiply operand. The checks expect
; a single vfmsub that uses %xmm1 twice and %xmm0 once.
611 define <4 x float> @test_v4f32_mul_add_x_negone_y(<4 x float> %x, <4 x float> %y) {
612 ; FMA-LABEL: test_v4f32_mul_add_x_negone_y:
614 ; FMA-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
617 ; FMA4-LABEL: test_v4f32_mul_add_x_negone_y:
619 ; FMA4-NEXT: vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
622 ; AVX512-LABEL: test_v4f32_mul_add_x_negone_y:
624 ; AVX512-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
626 %a = fadd <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
627 %m = fmul <4 x float> %a, %y
; %y * (%x + -1.0): commuted form, with the sum as the right multiply operand.
; The checks expect the same single vfmsub as the non-commuted variant.
631 define <4 x float> @test_v4f32_mul_y_add_x_negone(<4 x float> %x, <4 x float> %y) {
632 ; FMA-LABEL: test_v4f32_mul_y_add_x_negone:
634 ; FMA-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
637 ; FMA4-LABEL: test_v4f32_mul_y_add_x_negone:
639 ; FMA4-NEXT: vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
642 ; AVX512-LABEL: test_v4f32_mul_y_add_x_negone:
644 ; AVX512-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
646 %a = fadd <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
647 %m = fmul <4 x float> %y, %a
; (1.0 - %x) * %y, with the difference as the left multiply operand. The checks
; expect a single vfnmadd that uses %xmm1 twice and %xmm0 once.
651 define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) {
652 ; FMA-LABEL: test_v4f32_mul_sub_one_x_y:
654 ; FMA-NEXT: vfnmadd213ps %xmm1, %xmm1, %xmm0
657 ; FMA4-LABEL: test_v4f32_mul_sub_one_x_y:
659 ; FMA4-NEXT: vfnmaddps %xmm1, %xmm1, %xmm0, %xmm0
662 ; AVX512-LABEL: test_v4f32_mul_sub_one_x_y:
664 ; AVX512-NEXT: vfnmadd213ps %xmm1, %xmm1, %xmm0
666 %s = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
667 %m = fmul <4 x float> %s, %y
; %y * (1.0 - %x): commuted form, with the difference as the right multiply
; operand. The checks expect the same single vfnmadd as the non-commuted variant.
671 define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) {
672 ; FMA-LABEL: test_v4f32_mul_y_sub_one_x:
674 ; FMA-NEXT: vfnmadd213ps %xmm1, %xmm1, %xmm0
677 ; FMA4-LABEL: test_v4f32_mul_y_sub_one_x:
679 ; FMA4-NEXT: vfnmaddps %xmm1, %xmm1, %xmm0, %xmm0
682 ; AVX512-LABEL: test_v4f32_mul_y_sub_one_x:
684 ; AVX512-NEXT: vfnmadd213ps %xmm1, %xmm1, %xmm0
686 %s = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
687 %m = fmul <4 x float> %y, %s
; (-1.0 - %x) * %y, with the difference as the left multiply operand. The checks
; expect a single vfnmsub that uses %xmm1 twice and %xmm0 once.
691 define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y) {
692 ; FMA-LABEL: test_v4f32_mul_sub_negone_x_y:
694 ; FMA-NEXT: vfnmsub213ps %xmm1, %xmm1, %xmm0
697 ; FMA4-LABEL: test_v4f32_mul_sub_negone_x_y:
699 ; FMA4-NEXT: vfnmsubps %xmm1, %xmm1, %xmm0, %xmm0
702 ; AVX512-LABEL: test_v4f32_mul_sub_negone_x_y:
704 ; AVX512-NEXT: vfnmsub213ps %xmm1, %xmm1, %xmm0
706 %s = fsub <4 x float> <float -1.0, float -1.0, float -1.0, float -1.0>, %x
707 %m = fmul <4 x float> %s, %y
; %y * (-1.0 - %x): commuted form, with the difference as the right multiply
; operand. The checks expect the same single vfnmsub as the non-commuted variant.
711 define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y) {
712 ; FMA-LABEL: test_v4f32_mul_y_sub_negone_x:
714 ; FMA-NEXT: vfnmsub213ps %xmm1, %xmm1, %xmm0
717 ; FMA4-LABEL: test_v4f32_mul_y_sub_negone_x:
719 ; FMA4-NEXT: vfnmsubps %xmm1, %xmm1, %xmm0, %xmm0
722 ; AVX512-LABEL: test_v4f32_mul_y_sub_negone_x:
724 ; AVX512-NEXT: vfnmsub213ps %xmm1, %xmm1, %xmm0
726 %s = fsub <4 x float> <float -1.0, float -1.0, float -1.0, float -1.0>, %x
727 %m = fmul <4 x float> %y, %s
; (%x - 1.0) * %y, with the difference as the left multiply operand. The checks
; expect a single vfmsub that uses %xmm1 twice and %xmm0 once.
731 define <4 x float> @test_v4f32_mul_sub_x_one_y(<4 x float> %x, <4 x float> %y) {
732 ; FMA-LABEL: test_v4f32_mul_sub_x_one_y:
734 ; FMA-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
737 ; FMA4-LABEL: test_v4f32_mul_sub_x_one_y:
739 ; FMA4-NEXT: vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
742 ; AVX512-LABEL: test_v4f32_mul_sub_x_one_y:
744 ; AVX512-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
746 %s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
747 %m = fmul <4 x float> %s, %y
; %y * (%x - 1.0): commuted form, with the difference as the right multiply
; operand. The checks expect the same single vfmsub as the non-commuted variant.
751 define <4 x float> @test_v4f32_mul_y_sub_x_one(<4 x float> %x, <4 x float> %y) {
752 ; FMA-LABEL: test_v4f32_mul_y_sub_x_one:
754 ; FMA-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
757 ; FMA4-LABEL: test_v4f32_mul_y_sub_x_one:
759 ; FMA4-NEXT: vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
762 ; AVX512-LABEL: test_v4f32_mul_y_sub_x_one:
764 ; AVX512-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
766 %s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
767 %m = fmul <4 x float> %y, %s
; (%x - -1.0) * %y, with the difference as the left multiply operand. The checks
; expect a single vfmadd that uses %xmm1 twice and %xmm0 once.
771 define <4 x float> @test_v4f32_mul_sub_x_negone_y(<4 x float> %x, <4 x float> %y) {
772 ; FMA-LABEL: test_v4f32_mul_sub_x_negone_y:
774 ; FMA-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
777 ; FMA4-LABEL: test_v4f32_mul_sub_x_negone_y:
779 ; FMA4-NEXT: vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
782 ; AVX512-LABEL: test_v4f32_mul_sub_x_negone_y:
784 ; AVX512-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
786 %s = fsub <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
787 %m = fmul <4 x float> %s, %y
; %y * (%x - -1.0): commuted form, with the difference as the right multiply
; operand. The checks expect the same single vfmadd as the non-commuted variant.
791 define <4 x float> @test_v4f32_mul_y_sub_x_negone(<4 x float> %x, <4 x float> %y) {
792 ; FMA-LABEL: test_v4f32_mul_y_sub_x_negone:
794 ; FMA-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
797 ; FMA4-LABEL: test_v4f32_mul_y_sub_x_negone:
799 ; FMA4-NEXT: vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
802 ; AVX512-LABEL: test_v4f32_mul_y_sub_x_negone:
804 ; AVX512-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
806 %s = fsub <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
807 %m = fmul <4 x float> %y, %s
812 ; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
; Scalar float interpolation: (%x * %t) + (%y * (1.0 - %t)). The checks expect
; a vfnmadd followed by a vfmadd; the AVX-512 checks additionally expect a
; trailing vmovaps copy.
815 define float @test_f32_interp(float %x, float %y, float %t) {
816 ; FMA-LABEL: test_f32_interp:
818 ; FMA-NEXT: vfnmadd213ss %xmm1, %xmm2, %xmm1
819 ; FMA-NEXT: vfmadd213ss %xmm1, %xmm2, %xmm0
822 ; FMA4-LABEL: test_f32_interp:
824 ; FMA4-NEXT: vfnmaddss %xmm1, %xmm1, %xmm2, %xmm1
825 ; FMA4-NEXT: vfmaddss %xmm1, %xmm2, %xmm0, %xmm0
828 ; AVX512-LABEL: test_f32_interp:
830 ; AVX512-NEXT: vfnmadd213ss %xmm1, %xmm2, %xmm1
831 ; AVX512-NEXT: vfmadd213ss %xmm1, %xmm0, %xmm2
832 ; AVX512-NEXT: vmovaps %zmm2, %zmm0
834 %t1 = fsub float 1.0, %t
835 %tx = fmul float %x, %t
836 %ty = fmul float %y, %t1
837 %r = fadd float %tx, %ty
; <4 x float> interpolation: (%x * %t) + (%y * (1.0 - %t)). The checks expect
; a vfnmadd followed by a vfmadd; the AVX-512 checks expect a leading vmovaps
; copy of %t into a scratch register first.
841 define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float> %t) {
842 ; FMA-LABEL: test_v4f32_interp:
844 ; FMA-NEXT: vfnmadd213ps %xmm1, %xmm2, %xmm1
845 ; FMA-NEXT: vfmadd213ps %xmm1, %xmm2, %xmm0
848 ; FMA4-LABEL: test_v4f32_interp:
850 ; FMA4-NEXT: vfnmaddps %xmm1, %xmm1, %xmm2, %xmm1
851 ; FMA4-NEXT: vfmaddps %xmm1, %xmm2, %xmm0, %xmm0
854 ; AVX512-LABEL: test_v4f32_interp:
856 ; AVX512-NEXT: vmovaps %zmm2, %zmm3
857 ; AVX512-NEXT: vfnmadd213ps %xmm1, %xmm1, %xmm3
858 ; AVX512-NEXT: vfmadd213ps %xmm3, %xmm2, %xmm0
860 %t1 = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %t
861 %tx = fmul <4 x float> %x, %t
862 %ty = fmul <4 x float> %y, %t1
863 %r = fadd <4 x float> %tx, %ty
; <8 x float> interpolation: (%x * %t) + (%y * (1.0 - %t)). The checks expect
; a vfnmadd followed by a vfmadd on ymm registers; the AVX-512 checks expect a
; leading vmovaps copy of %t into a scratch register first.
867 define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float> %t) {
868 ; FMA-LABEL: test_v8f32_interp:
870 ; FMA-NEXT: vfnmadd213ps %ymm1, %ymm2, %ymm1
871 ; FMA-NEXT: vfmadd213ps %ymm1, %ymm2, %ymm0
874 ; FMA4-LABEL: test_v8f32_interp:
876 ; FMA4-NEXT: vfnmaddps %ymm1, %ymm1, %ymm2, %ymm1
877 ; FMA4-NEXT: vfmaddps %ymm1, %ymm2, %ymm0, %ymm0
880 ; AVX512-LABEL: test_v8f32_interp:
882 ; AVX512-NEXT: vmovaps %zmm2, %zmm3
883 ; AVX512-NEXT: vfnmadd213ps %ymm1, %ymm1, %ymm3
884 ; AVX512-NEXT: vfmadd213ps %ymm3, %ymm2, %ymm0
886 %t1 = fsub <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %t
887 %tx = fmul <8 x float> %x, %t
888 %ty = fmul <8 x float> %y, %t1
889 %r = fadd <8 x float> %tx, %ty
; Scalar double interpolation: (%x * %t) + (%y * (1.0 - %t)). The checks expect
; a vfnmadd followed by a vfmadd; the AVX-512 checks additionally expect a
; trailing vmovaps copy.
893 define double @test_f64_interp(double %x, double %y, double %t) {
894 ; FMA-LABEL: test_f64_interp:
896 ; FMA-NEXT: vfnmadd213sd %xmm1, %xmm2, %xmm1
897 ; FMA-NEXT: vfmadd213sd %xmm1, %xmm2, %xmm0
900 ; FMA4-LABEL: test_f64_interp:
902 ; FMA4-NEXT: vfnmaddsd %xmm1, %xmm1, %xmm2, %xmm1
903 ; FMA4-NEXT: vfmaddsd %xmm1, %xmm2, %xmm0, %xmm0
906 ; AVX512-LABEL: test_f64_interp:
908 ; AVX512-NEXT: vfnmadd213sd %xmm1, %xmm2, %xmm1
909 ; AVX512-NEXT: vfmadd213sd %xmm1, %xmm0, %xmm2
910 ; AVX512-NEXT: vmovaps %zmm2, %zmm0
912 %t1 = fsub double 1.0, %t
913 %tx = fmul double %x, %t
914 %ty = fmul double %y, %t1
915 %r = fadd double %tx, %ty
; <2 x double> interpolation: (%x * %t) + (%y * (1.0 - %t)). The checks expect
; a vfnmadd followed by a vfmadd; the AVX-512 checks expect a leading vmovaps
; copy of %t into a scratch register first.
919 define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x double> %t) {
920 ; FMA-LABEL: test_v2f64_interp:
922 ; FMA-NEXT: vfnmadd213pd %xmm1, %xmm2, %xmm1
923 ; FMA-NEXT: vfmadd213pd %xmm1, %xmm2, %xmm0
926 ; FMA4-LABEL: test_v2f64_interp:
928 ; FMA4-NEXT: vfnmaddpd %xmm1, %xmm1, %xmm2, %xmm1
929 ; FMA4-NEXT: vfmaddpd %xmm1, %xmm2, %xmm0, %xmm0
932 ; AVX512-LABEL: test_v2f64_interp:
934 ; AVX512-NEXT: vmovaps %zmm2, %zmm3
935 ; AVX512-NEXT: vfnmadd213pd %xmm1, %xmm1, %xmm3
936 ; AVX512-NEXT: vfmadd213pd %xmm3, %xmm2, %xmm0
938 %t1 = fsub <2 x double> <double 1.0, double 1.0>, %t
939 %tx = fmul <2 x double> %x, %t
940 %ty = fmul <2 x double> %y, %t1
941 %r = fadd <2 x double> %tx, %ty
; <4 x double> interpolation: (%x * %t) + (%y * (1.0 - %t)). The checks expect
; a vfnmadd followed by a vfmadd on ymm registers; the AVX-512 checks expect a
; leading vmovaps copy of %t into a scratch register first.
945 define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x double> %t) {
946 ; FMA-LABEL: test_v4f64_interp:
948 ; FMA-NEXT: vfnmadd213pd %ymm1, %ymm2, %ymm1
949 ; FMA-NEXT: vfmadd213pd %ymm1, %ymm2, %ymm0
952 ; FMA4-LABEL: test_v4f64_interp:
954 ; FMA4-NEXT: vfnmaddpd %ymm1, %ymm1, %ymm2, %ymm1
955 ; FMA4-NEXT: vfmaddpd %ymm1, %ymm2, %ymm0, %ymm0
958 ; AVX512-LABEL: test_v4f64_interp:
960 ; AVX512-NEXT: vmovaps %zmm2, %zmm3
961 ; AVX512-NEXT: vfnmadd213pd %ymm1, %ymm1, %ymm3
962 ; AVX512-NEXT: vfmadd213pd %ymm3, %ymm2, %ymm0
964 %t1 = fsub <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, %t
965 %tx = fmul <4 x double> %x, %t
966 %ty = fmul <4 x double> %y, %t1
967 %r = fadd <4 x double> %tx, %ty
972 ; Pattern: (fneg (fma x, y, z)) -> (fma x, -y, -z)
; Negation of a multiply-add: -((%a0 * %a1) + %a2), with the negation written
; as a subtraction from a -0.0 splat. The function carries attribute group #0;
; the checks expect the whole expression to become a single vfnmsub.
975 define <4 x float> @test_v4f32_fneg_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
976 ; FMA-LABEL: test_v4f32_fneg_fmadd:
978 ; FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
981 ; FMA4-LABEL: test_v4f32_fneg_fmadd:
983 ; FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
986 ; AVX512-LABEL: test_v4f32_fneg_fmadd:
988 ; AVX512-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
990 %mul = fmul <4 x float> %a0, %a1
991 %add = fadd <4 x float> %mul, %a2
992 %neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
; Negation of a multiply-subtract: returns -((%a0 * %a1) - %a2), with the
; negation written as a subtraction from a -0.0 splat. The function carries
; attribute group #0; the checks expect a single vfnmadd.
996 define <4 x double> @test_v4f64_fneg_fmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
997 ; FMA-LABEL: test_v4f64_fneg_fmsub:
999 ; FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
1002 ; FMA4-LABEL: test_v4f64_fneg_fmsub:
1004 ; FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
1007 ; AVX512-LABEL: test_v4f64_fneg_fmsub:
1009 ; AVX512-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
1011 %mul = fmul <4 x double> %a0, %a1
1012 %sub = fsub <4 x double> %mul, %a2
1013 %neg = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %sub
1014 ret <4 x double> %neg
; Double negation around a multiply-add: returns -((-(%a0 * %a1)) + %a2), both
; negations written as subtractions from a -0.0 splat. The function carries
; attribute group #0; the checks expect a single vfmsub.
1017 define <4 x float> @test_v4f32_fneg_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
1018 ; FMA-LABEL: test_v4f32_fneg_fnmadd:
1020 ; FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
1023 ; FMA4-LABEL: test_v4f32_fneg_fnmadd:
1025 ; FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
1028 ; AVX512-LABEL: test_v4f32_fneg_fnmadd:
1030 ; AVX512-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
1032 %mul = fmul <4 x float> %a0, %a1
1033 %neg0 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %mul
1034 %add = fadd <4 x float> %neg0, %a2
1035 %neg1 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
1036 ret <4 x float> %neg1
; Double negation around a multiply-subtract: returns -((-(%a0 * %a1)) - %a2),
; both negations written as subtractions from a -0.0 splat. The function
; carries attribute group #0; the checks expect a single vfmadd.
1039 define <4 x double> @test_v4f64_fneg_fnmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
1040 ; FMA-LABEL: test_v4f64_fneg_fnmsub:
1042 ; FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
1045 ; FMA4-LABEL: test_v4f64_fneg_fnmsub:
1047 ; FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
1050 ; AVX512-LABEL: test_v4f64_fneg_fnmsub:
1052 ; AVX512-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
1054 %mul = fmul <4 x double> %a0, %a1
1055 %neg0 = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %mul
1056 %sub = fsub <4 x double> %neg0, %a2
1057 %neg1 = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %sub
1058 ret <4 x double> %neg1
1062 ; Pattern: (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
; (%x * <1,2,3,4>) + (%x * <4,3,2,1>) under attribute group #0. The checks
; expect a single vmulps against a RIP-relative constant instead of any fused
; instruction; the AVX-512 checks expect the {1to4} broadcast form of that
; operand (the two constant vectors sum to 5.0 in every lane).
1065 define <4 x float> @test_v4f32_fma_x_c1_fmul_x_c2(<4 x float> %x) #0 {
1066 ; FMA-LABEL: test_v4f32_fma_x_c1_fmul_x_c2:
1068 ; FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
1071 ; FMA4-LABEL: test_v4f32_fma_x_c1_fmul_x_c2:
1073 ; FMA4-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
1076 ; AVX512-LABEL: test_v4f32_fma_x_c1_fmul_x_c2:
1078 ; AVX512-NEXT: vmulps {{.*}}(%rip){1to4}, %xmm0, %xmm0
1080 %m0 = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
1081 %m1 = fmul <4 x float> %x, <float 4.0, float 3.0, float 2.0, float 1.0>
1082 %a = fadd <4 x float> %m0, %m1
1087 ; Pattern: (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
; ((%x * <1,2,3,4>) * <4,3,2,1>) + %y under attribute group #0. The checks
; expect a single vfmadd with one RIP-relative constant operand, i.e. no
; separate vmulps for the two constant multiplies; the AVX-512 checks
; additionally expect a trailing vmovaps copy.
1090 define <4 x float> @test_v4f32_fma_fmul_x_c1_c2_y(<4 x float> %x, <4 x float> %y) #0 {
1091 ; FMA-LABEL: test_v4f32_fma_fmul_x_c1_c2_y:
1093 ; FMA-NEXT: vfmadd132ps {{.*}}(%rip), %xmm1, %xmm0
1096 ; FMA4-LABEL: test_v4f32_fma_fmul_x_c1_c2_y:
1098 ; FMA4-NEXT: vfmaddps %xmm1, {{.*}}(%rip), %xmm0, %xmm0
1101 ; AVX512-LABEL: test_v4f32_fma_fmul_x_c1_c2_y:
1103 ; AVX512-NEXT: vfmadd231ps {{.*}}(%rip), %xmm0, %xmm1
1104 ; AVX512-NEXT: vmovaps %zmm1, %zmm0
1106 %m0 = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
1107 %m1 = fmul <4 x float> %m0, <float 4.0, float 3.0, float 2.0, float 1.0>
1108 %a = fadd <4 x float> %m1, %y
1112 ; Pattern: (fneg (fmul nsz x, y)) -> (fnmsub x, y, 0); without nsz the fmul and fneg stay separate
; Scalar double: -(%x * %y), where the fmul carries the nsz flag and the
; negation is a subtraction from -0.0. The checks expect a register-zeroing xor
; followed by one vfnmsub; the AVX-512 checks add a trailing vmovaps copy.
1114 define double @test_f64_fneg_fmul(double %x, double %y) #0 {
1115 ; FMA-LABEL: test_f64_fneg_fmul:
1117 ; FMA-NEXT: vxorpd %xmm2, %xmm2, %xmm2
1118 ; FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0
1121 ; FMA4-LABEL: test_f64_fneg_fmul:
1123 ; FMA4-NEXT: vxorpd %xmm2, %xmm2, %xmm2
1124 ; FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
1127 ; AVX512-LABEL: test_f64_fneg_fmul:
1129 ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
1130 ; AVX512-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1
1131 ; AVX512-NEXT: vmovaps %zmm1, %zmm0
1133 %m = fmul nsz double %x, %y
1134 %n = fsub double -0.0, %m
; <4 x float>: -(%x * %y), where the fmul carries the nsz flag and the negation
; is a subtraction from a -0.0 splat. The checks expect a register-zeroing xor
; followed by one packed-single vfnmsub.
1138 define <4 x float> @test_v4f32_fneg_fmul(<4 x float> %x, <4 x float> %y) #0 {
1139 ; FMA-LABEL: test_v4f32_fneg_fmul:
1141 ; FMA-NEXT: vxorps %xmm2, %xmm2, %xmm2
1142 ; FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
1145 ; FMA4-LABEL: test_v4f32_fneg_fmul:
1147 ; FMA4-NEXT: vxorps %xmm2, %xmm2, %xmm2
1148 ; FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
1151 ; AVX512-LABEL: test_v4f32_fneg_fmul:
1153 ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
1154 ; AVX512-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
1156 %m = fmul nsz <4 x float> %x, %y
1157 %n = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %m
; <4 x double>: -(%x * %y), where the fmul carries the nsz flag and the negation
; is a subtraction from a -0.0 splat. The checks expect a register-zeroing xor
; followed by one packed-double vfnmsub on ymm registers.
1161 define <4 x double> @test_v4f64_fneg_fmul(<4 x double> %x, <4 x double> %y) #0 {
1162 ; FMA-LABEL: test_v4f64_fneg_fmul:
1164 ; FMA-NEXT: vxorpd %ymm2, %ymm2, %ymm2
1165 ; FMA-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
1168 ; FMA4-LABEL: test_v4f64_fneg_fmul:
1170 ; FMA4-NEXT: vxorpd %ymm2, %ymm2, %ymm2
1171 ; FMA4-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
1174 ; AVX512-LABEL: test_v4f64_fneg_fmul:
1176 ; AVX512-NEXT: vxorps %ymm2, %ymm2, %ymm2
1177 ; AVX512-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
1179 %m = fmul nsz <4 x double> %x, %y
1180 %n = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %m
; Negative test: same -(%x * %y) shape as the preceding tests, but the fmul has
; no nsz flag. One shared set of checks covers every run configuration and
; expects a plain vmulpd followed by a vxorpd with a RIP-relative constant,
; i.e. no fused instruction.
1184 define <4 x double> @test_v4f64_fneg_fmul_no_nsz(<4 x double> %x, <4 x double> %y) #0 {
1185 ; ALL-LABEL: test_v4f64_fneg_fmul_no_nsz:
1187 ; ALL-NEXT: vmulpd %ymm1, %ymm0, %ymm0
1188 ; ALL-NEXT: vxorpd {{.*}}(%rip), %ymm0, %ymm0
1190 %m = fmul <4 x double> %x, %y
1191 %n = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %m
; Attribute group #0: sets "unsafe-fp-math"="true" on every function above
; whose signature ends in #0.
1195 attributes #0 = { "unsafe-fp-math"="true" }