; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s

; Attribute group #0 is attached to every function definition in this file.
attributes #0 = { nounwind }

; x86 FMA intrinsics with the .ss suffix: three <4 x float> operands,
; <4 x float> result.
declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>)

; x86 FMA intrinsics with the .sd suffix: three <2 x double> operands,
; <2 x double> result.
declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>)
; Calls @llvm.x86.fma.vfmadd.ss with operand order (a, a, b), where a = *%a and
; b = *%b are each placed in lane 0 of an otherwise-zero <4 x float>.
; Lane 0 of the result is stored back through %a.
; The checks expect the load of %b to be folded into the FMA as its memory
; operand; (%rcx) is %a and (%rdx) is %b under the Win64 convention implied by
; the test triple.
define void @fmadd_aab_ss(float* %a, float* %b) #0 {
; CHECK-LABEL: fmadd_aab_ss:
; CHECK: vmovss (%rcx), %[[XMM:xmm[0-9]+]]
; CHECK-NEXT: vfmadd213ss (%rdx), %[[XMM]], %[[XMM]]
; CHECK-NEXT: vmovss %[[XMM]], (%rcx)
  ; %av = <a, 0.0, 0.0, 0.0>
  %a.val = load float, float* %a
  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
  %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

  ; %bv = <b, 0.0, 0.0, 0.0>
  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3

  %vr = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %av, <4 x float> %av, <4 x float> %bv)

  ; Only lane 0 of the intrinsic result is written back.
  %sr = extractelement <4 x float> %vr, i32 0
  store float %sr, float* %a
  ret void
}
; Calls @llvm.x86.fma.vfmadd.ss with operand order (a, b, a), where a = *%a and
; b = *%b are each placed in lane 0 of an otherwise-zero <4 x float>.
; Lane 0 of the result is stored back through %a.
; The checks expect the load of %b to be folded into the 132 form of the FMA
; as its memory operand; (%rcx) is %a and (%rdx) is %b under the Win64
; convention implied by the test triple.
define void @fmadd_aba_ss(float* %a, float* %b) #0 {
; CHECK-LABEL: fmadd_aba_ss:
; CHECK: vmovss (%rcx), %[[XMM:xmm[0-9]+]]
; CHECK-NEXT: vfmadd132ss (%rdx), %[[XMM]], %[[XMM]]
; CHECK-NEXT: vmovss %[[XMM]], (%rcx)
  ; %av = <a, 0.0, 0.0, 0.0>
  %a.val = load float, float* %a
  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
  %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

  ; %bv = <b, 0.0, 0.0, 0.0>
  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3

  %vr = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av)

  ; Only lane 0 of the intrinsic result is written back.
  %sr = extractelement <4 x float> %vr, i32 0
  store float %sr, float* %a
  ret void
}
; Calls @llvm.x86.fma.vfmsub.ss with operand order (a, a, b), where a = *%a and
; b = *%b are each placed in lane 0 of an otherwise-zero <4 x float>.
; Lane 0 of the result is stored back through %a.
; The checks expect the load of %b to be folded into the 213 form of the
; instruction as its memory operand; (%rcx) is %a and (%rdx) is %b under the
; Win64 convention implied by the test triple.
define void @fmsub_aab_ss(float* %a, float* %b) #0 {
; CHECK-LABEL: fmsub_aab_ss:
; CHECK: vmovss (%rcx), %[[XMM:xmm[0-9]+]]
; CHECK-NEXT: vfmsub213ss (%rdx), %[[XMM]], %[[XMM]]
; CHECK-NEXT: vmovss %[[XMM]], (%rcx)
  ; %av = <a, 0.0, 0.0, 0.0>
  %a.val = load float, float* %a
  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
  %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

  ; %bv = <b, 0.0, 0.0, 0.0>
  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3

  %vr = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %av, <4 x float> %av, <4 x float> %bv)

  ; Only lane 0 of the intrinsic result is written back.
  %sr = extractelement <4 x float> %vr, i32 0
  store float %sr, float* %a
  ret void
}
; Calls @llvm.x86.fma.vfmsub.ss with operand order (a, b, a), where a = *%a and
; b = *%b are each placed in lane 0 of an otherwise-zero <4 x float>.
; Lane 0 of the result is stored back through %a.
; The checks expect the load of %b to be folded into the 132 form of the
; instruction as its memory operand; (%rcx) is %a and (%rdx) is %b under the
; Win64 convention implied by the test triple.
define void @fmsub_aba_ss(float* %a, float* %b) #0 {
; CHECK-LABEL: fmsub_aba_ss:
; CHECK: vmovss (%rcx), %[[XMM:xmm[0-9]+]]
; CHECK-NEXT: vfmsub132ss (%rdx), %[[XMM]], %[[XMM]]
; CHECK-NEXT: vmovss %[[XMM]], (%rcx)
  ; %av = <a, 0.0, 0.0, 0.0>
  %a.val = load float, float* %a
  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
  %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

  ; %bv = <b, 0.0, 0.0, 0.0>
  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3

  %vr = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av)

  ; Only lane 0 of the intrinsic result is written back.
  %sr = extractelement <4 x float> %vr, i32 0
  store float %sr, float* %a
  ret void
}
; Calls @llvm.x86.fma.vfnmadd.ss with operand order (a, a, b), where a = *%a
; and b = *%b are each placed in lane 0 of an otherwise-zero <4 x float>.
; Lane 0 of the result is stored back through %a.
; The checks expect the load of %b to be folded into the 213 form of the
; instruction as its memory operand; (%rcx) is %a and (%rdx) is %b under the
; Win64 convention implied by the test triple.
define void @fnmadd_aab_ss(float* %a, float* %b) #0 {
; CHECK-LABEL: fnmadd_aab_ss:
; CHECK: vmovss (%rcx), %[[XMM:xmm[0-9]+]]
; CHECK-NEXT: vfnmadd213ss (%rdx), %[[XMM]], %[[XMM]]
; CHECK-NEXT: vmovss %[[XMM]], (%rcx)
  ; %av = <a, 0.0, 0.0, 0.0>
  %a.val = load float, float* %a
  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
  %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

  ; %bv = <b, 0.0, 0.0, 0.0>
  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3

  %vr = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %av, <4 x float> %av, <4 x float> %bv)

  ; Only lane 0 of the intrinsic result is written back.
  %sr = extractelement <4 x float> %vr, i32 0
  store float %sr, float* %a
  ret void
}
; Calls @llvm.x86.fma.vfnmadd.ss with operand order (a, b, a), where a = *%a
; and b = *%b are each placed in lane 0 of an otherwise-zero <4 x float>.
; Lane 0 of the result is stored back through %a.
; The checks expect the load of %b to be folded into the 132 form of the
; instruction as its memory operand; (%rcx) is %a and (%rdx) is %b under the
; Win64 convention implied by the test triple.
define void @fnmadd_aba_ss(float* %a, float* %b) #0 {
; CHECK-LABEL: fnmadd_aba_ss:
; CHECK: vmovss (%rcx), %[[XMM:xmm[0-9]+]]
; CHECK-NEXT: vfnmadd132ss (%rdx), %[[XMM]], %[[XMM]]
; CHECK-NEXT: vmovss %[[XMM]], (%rcx)
  ; %av = <a, 0.0, 0.0, 0.0>
  %a.val = load float, float* %a
  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
  %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

  ; %bv = <b, 0.0, 0.0, 0.0>
  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3

  %vr = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av)

  ; Only lane 0 of the intrinsic result is written back.
  %sr = extractelement <4 x float> %vr, i32 0
  store float %sr, float* %a
  ret void
}
; Calls @llvm.x86.fma.vfnmsub.ss with operand order (a, a, b), where a = *%a
; and b = *%b are each placed in lane 0 of an otherwise-zero <4 x float>.
; Lane 0 of the result is stored back through %a.
; The checks expect the load of %b to be folded into the 213 form of the
; instruction as its memory operand; (%rcx) is %a and (%rdx) is %b under the
; Win64 convention implied by the test triple.
define void @fnmsub_aab_ss(float* %a, float* %b) #0 {
; CHECK-LABEL: fnmsub_aab_ss:
; CHECK: vmovss (%rcx), %[[XMM:xmm[0-9]+]]
; CHECK-NEXT: vfnmsub213ss (%rdx), %[[XMM]], %[[XMM]]
; CHECK-NEXT: vmovss %[[XMM]], (%rcx)
  ; %av = <a, 0.0, 0.0, 0.0>
  %a.val = load float, float* %a
  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
  %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

  ; %bv = <b, 0.0, 0.0, 0.0>
  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3

  %vr = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %av, <4 x float> %av, <4 x float> %bv)

  ; Only lane 0 of the intrinsic result is written back.
  %sr = extractelement <4 x float> %vr, i32 0
  store float %sr, float* %a
  ret void
}
; Calls @llvm.x86.fma.vfnmsub.ss with operand order (a, b, a), where a = *%a
; and b = *%b are each placed in lane 0 of an otherwise-zero <4 x float>.
; Lane 0 of the result is stored back through %a.
; The checks expect the load of %b to be folded into the 132 form of the
; instruction as its memory operand; (%rcx) is %a and (%rdx) is %b under the
; Win64 convention implied by the test triple.
define void @fnmsub_aba_ss(float* %a, float* %b) #0 {
; CHECK-LABEL: fnmsub_aba_ss:
; CHECK: vmovss (%rcx), %[[XMM:xmm[0-9]+]]
; CHECK-NEXT: vfnmsub132ss (%rdx), %[[XMM]], %[[XMM]]
; CHECK-NEXT: vmovss %[[XMM]], (%rcx)
  ; %av = <a, 0.0, 0.0, 0.0>
  %a.val = load float, float* %a
  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
  %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

  ; %bv = <b, 0.0, 0.0, 0.0>
  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3

  %vr = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av)

  ; Only lane 0 of the intrinsic result is written back.
  %sr = extractelement <4 x float> %vr, i32 0
  store float %sr, float* %a
  ret void
}
; Calls @llvm.x86.fma.vfmadd.sd with operand order (a, a, b), where a = *%a and
; b = *%b are each placed in lane 0 of a <2 x double> whose lane 1 is zero.
; Lane 0 of the result is stored back through %a.
; The checks expect the load of %b to be folded into the 213 form of the FMA
; as its memory operand; (%rcx) is %a and (%rdx) is %b under the Win64
; convention implied by the test triple.
define void @fmadd_aab_sd(double* %a, double* %b) #0 {
; CHECK-LABEL: fmadd_aab_sd:
; CHECK: vmovsd (%rcx), %[[XMM:xmm[0-9]+]]
; CHECK-NEXT: vfmadd213sd (%rdx), %[[XMM]], %[[XMM]]
; CHECK-NEXT: vmovlpd %[[XMM]], (%rcx)
  ; %av = <a, 0.0>
  %a.val = load double, double* %a
  %av0 = insertelement <2 x double> undef, double %a.val, i32 0
  %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1

  ; %bv = <b, 0.0>
  %b.val = load double, double* %b
  %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
  %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1

  %vr = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %av, <2 x double> %av, <2 x double> %bv)

  ; Only lane 0 of the intrinsic result is written back.
  %sr = extractelement <2 x double> %vr, i32 0
  store double %sr, double* %a
  ret void
}
; Calls @llvm.x86.fma.vfmadd.sd with operand order (a, b, a), where a = *%a and
; b = *%b are each placed in lane 0 of a <2 x double> whose lane 1 is zero.
; Lane 0 of the result is stored back through %a.
; The checks expect the load of %b to be folded into the 132 form of the FMA
; as its memory operand; (%rcx) is %a and (%rdx) is %b under the Win64
; convention implied by the test triple.
define void @fmadd_aba_sd(double* %a, double* %b) #0 {
; CHECK-LABEL: fmadd_aba_sd:
; CHECK: vmovsd (%rcx), %[[XMM:xmm[0-9]+]]
; CHECK-NEXT: vfmadd132sd (%rdx), %[[XMM]], %[[XMM]]
; CHECK-NEXT: vmovlpd %[[XMM]], (%rcx)
  ; %av = <a, 0.0>
  %a.val = load double, double* %a
  %av0 = insertelement <2 x double> undef, double %a.val, i32 0
  %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1

  ; %bv = <b, 0.0>
  %b.val = load double, double* %b
  %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
  %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1

  %vr = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av)

  ; Only lane 0 of the intrinsic result is written back.
  %sr = extractelement <2 x double> %vr, i32 0
  store double %sr, double* %a
  ret void
}
; Calls @llvm.x86.fma.vfmsub.sd with operand order (a, a, b), where a = *%a and
; b = *%b are each placed in lane 0 of a <2 x double> whose lane 1 is zero.
; Lane 0 of the result is stored back through %a.
; The checks expect the load of %b to be folded into the 213 form of the
; instruction as its memory operand; (%rcx) is %a and (%rdx) is %b under the
; Win64 convention implied by the test triple.
define void @fmsub_aab_sd(double* %a, double* %b) #0 {
; CHECK-LABEL: fmsub_aab_sd:
; CHECK: vmovsd (%rcx), %[[XMM:xmm[0-9]+]]
; CHECK-NEXT: vfmsub213sd (%rdx), %[[XMM]], %[[XMM]]
; CHECK-NEXT: vmovlpd %[[XMM]], (%rcx)
  ; %av = <a, 0.0>
  %a.val = load double, double* %a
  %av0 = insertelement <2 x double> undef, double %a.val, i32 0
  %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1

  ; %bv = <b, 0.0>
  %b.val = load double, double* %b
  %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
  %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1

  %vr = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %av, <2 x double> %av, <2 x double> %bv)

  ; Only lane 0 of the intrinsic result is written back.
  %sr = extractelement <2 x double> %vr, i32 0
  store double %sr, double* %a
  ret void
}
; Calls @llvm.x86.fma.vfmsub.sd with operand order (a, b, a), where a = *%a and
; b = *%b are each placed in lane 0 of a <2 x double> whose lane 1 is zero.
; Lane 0 of the result is stored back through %a.
; The checks expect the load of %b to be folded into the 132 form of the
; instruction as its memory operand; (%rcx) is %a and (%rdx) is %b under the
; Win64 convention implied by the test triple.
define void @fmsub_aba_sd(double* %a, double* %b) #0 {
; CHECK-LABEL: fmsub_aba_sd:
; CHECK: vmovsd (%rcx), %[[XMM:xmm[0-9]+]]
; CHECK-NEXT: vfmsub132sd (%rdx), %[[XMM]], %[[XMM]]
; CHECK-NEXT: vmovlpd %[[XMM]], (%rcx)
  ; %av = <a, 0.0>
  %a.val = load double, double* %a
  %av0 = insertelement <2 x double> undef, double %a.val, i32 0
  %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1

  ; %bv = <b, 0.0>
  %b.val = load double, double* %b
  %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
  %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1

  %vr = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av)

  ; Only lane 0 of the intrinsic result is written back.
  %sr = extractelement <2 x double> %vr, i32 0
  store double %sr, double* %a
  ret void
}
; Calls @llvm.x86.fma.vfnmadd.sd with operand order (a, a, b), where a = *%a
; and b = *%b are each placed in lane 0 of a <2 x double> whose lane 1 is zero.
; Lane 0 of the result is stored back through %a.
; The checks expect the load of %b to be folded into the 213 form of the
; instruction as its memory operand; (%rcx) is %a and (%rdx) is %b under the
; Win64 convention implied by the test triple.
define void @fnmadd_aab_sd(double* %a, double* %b) #0 {
; CHECK-LABEL: fnmadd_aab_sd:
; CHECK: vmovsd (%rcx), %[[XMM:xmm[0-9]+]]
; CHECK-NEXT: vfnmadd213sd (%rdx), %[[XMM]], %[[XMM]]
; CHECK-NEXT: vmovlpd %[[XMM]], (%rcx)
  ; %av = <a, 0.0>
  %a.val = load double, double* %a
  %av0 = insertelement <2 x double> undef, double %a.val, i32 0
  %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1

  ; %bv = <b, 0.0>
  %b.val = load double, double* %b
  %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
  %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1

  %vr = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %av, <2 x double> %av, <2 x double> %bv)

  ; Only lane 0 of the intrinsic result is written back.
  %sr = extractelement <2 x double> %vr, i32 0
  store double %sr, double* %a
  ret void
}
; Calls @llvm.x86.fma.vfnmadd.sd with operand order (a, b, a), where a = *%a
; and b = *%b are each placed in lane 0 of a <2 x double> whose lane 1 is zero.
; Lane 0 of the result is stored back through %a.
; The checks expect the load of %b to be folded into the 132 form of the
; instruction as its memory operand; (%rcx) is %a and (%rdx) is %b under the
; Win64 convention implied by the test triple.
define void @fnmadd_aba_sd(double* %a, double* %b) #0 {
; CHECK-LABEL: fnmadd_aba_sd:
; CHECK: vmovsd (%rcx), %[[XMM:xmm[0-9]+]]
; CHECK-NEXT: vfnmadd132sd (%rdx), %[[XMM]], %[[XMM]]
; CHECK-NEXT: vmovlpd %[[XMM]], (%rcx)
  ; %av = <a, 0.0>
  %a.val = load double, double* %a
  %av0 = insertelement <2 x double> undef, double %a.val, i32 0
  %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1

  ; %bv = <b, 0.0>
  %b.val = load double, double* %b
  %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
  %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1

  %vr = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av)

  ; Only lane 0 of the intrinsic result is written back.
  %sr = extractelement <2 x double> %vr, i32 0
  store double %sr, double* %a
  ret void
}
; Calls @llvm.x86.fma.vfnmsub.sd with operand order (a, a, b), where a = *%a
; and b = *%b are each placed in lane 0 of a <2 x double> whose lane 1 is zero.
; Lane 0 of the result is stored back through %a.
; The checks expect the load of %b to be folded into the 213 form of the
; instruction as its memory operand; (%rcx) is %a and (%rdx) is %b under the
; Win64 convention implied by the test triple.
define void @fnmsub_aab_sd(double* %a, double* %b) #0 {
; CHECK-LABEL: fnmsub_aab_sd:
; CHECK: vmovsd (%rcx), %[[XMM:xmm[0-9]+]]
; CHECK-NEXT: vfnmsub213sd (%rdx), %[[XMM]], %[[XMM]]
; CHECK-NEXT: vmovlpd %[[XMM]], (%rcx)
  ; %av = <a, 0.0>
  %a.val = load double, double* %a
  %av0 = insertelement <2 x double> undef, double %a.val, i32 0
  %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1

  ; %bv = <b, 0.0>
  %b.val = load double, double* %b
  %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
  %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1

  %vr = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %av, <2 x double> %av, <2 x double> %bv)

  ; Only lane 0 of the intrinsic result is written back.
  %sr = extractelement <2 x double> %vr, i32 0
  store double %sr, double* %a
  ret void
}
; Calls @llvm.x86.fma.vfnmsub.sd with operand order (a, b, a), where a = *%a
; and b = *%b are each placed in lane 0 of a <2 x double> whose lane 1 is zero.
; Lane 0 of the result is stored back through %a.
; The checks expect the load of %b to be folded into the 132 form of the
; instruction as its memory operand; (%rcx) is %a and (%rdx) is %b under the
; Win64 convention implied by the test triple.
define void @fnmsub_aba_sd(double* %a, double* %b) #0 {
; CHECK-LABEL: fnmsub_aba_sd:
; CHECK: vmovsd (%rcx), %[[XMM:xmm[0-9]+]]
; CHECK-NEXT: vfnmsub132sd (%rdx), %[[XMM]], %[[XMM]]
; CHECK-NEXT: vmovlpd %[[XMM]], (%rcx)
  ; %av = <a, 0.0>
  %a.val = load double, double* %a
  %av0 = insertelement <2 x double> undef, double %a.val, i32 0
  %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1

  ; %bv = <b, 0.0>
  %b.val = load double, double* %b
  %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
  %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1

  %vr = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av)

  ; Only lane 0 of the intrinsic result is written back.
  %sr = extractelement <2 x double> %vr, i32 0
  store double %sr, double* %a
  ret void
}