1 ; RUN: llc -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE2 %s
2 ; RUN: llc -mcpu=x86-64 -mattr=+sse4.1 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE41 %s
3 ; RUN: llc -mcpu=x86-64 -mattr=+avx < %s | FileCheck --check-prefix=AVX %s
5 target triple = "x86_64-unknown-unknown"
7 ; Ensure that the backend no longer emits unnecessary vector insert
8 ; instructions immediately after SSE scalar fp instructions
; Scalar fadd on lane 0: computes a[0] + b[0] from the extracted elements and
; inserts the sum into lane 0 of %a. The checks expect a single addss (SSE
; runs) or vaddss (AVX run) on %xmm1 and %xmm0.
11 define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
12 ; SSE-LABEL: test_add_ss:
14 ; SSE-NEXT: addss %xmm1, %xmm0
17 ; AVX-LABEL: test_add_ss:
19 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
21 %1 = extractelement <4 x float> %b, i32 0
22 %2 = extractelement <4 x float> %a, i32 0
23 %add = fadd float %2, %1
24 %3 = insertelement <4 x float> %a, float %add, i32 0
; Scalar fsub on lane 0: computes a[0] - b[0] from the extracted elements and
; inserts the difference into lane 0 of %a. The checks expect a single subss
; (SSE runs) or vsubss (AVX run) on %xmm1 and %xmm0.
28 define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
29 ; SSE-LABEL: test_sub_ss:
31 ; SSE-NEXT: subss %xmm1, %xmm0
34 ; AVX-LABEL: test_sub_ss:
36 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
38 %1 = extractelement <4 x float> %b, i32 0
39 %2 = extractelement <4 x float> %a, i32 0
40 %sub = fsub float %2, %1
41 %3 = insertelement <4 x float> %a, float %sub, i32 0
; Scalar fmul on lane 0: computes a[0] * b[0] from the extracted elements and
; inserts the product into lane 0 of %a. The checks expect a single mulss
; (SSE runs) or vmulss (AVX run) on %xmm1 and %xmm0.
45 define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
46 ; SSE-LABEL: test_mul_ss:
48 ; SSE-NEXT: mulss %xmm1, %xmm0
51 ; AVX-LABEL: test_mul_ss:
53 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
55 %1 = extractelement <4 x float> %b, i32 0
56 %2 = extractelement <4 x float> %a, i32 0
57 %mul = fmul float %2, %1
58 %3 = insertelement <4 x float> %a, float %mul, i32 0
; Scalar fdiv on lane 0: computes a[0] / b[0] from the extracted elements and
; inserts the quotient into lane 0 of %a. The checks expect a single divss
; (SSE runs) or vdivss (AVX run) on %xmm1 and %xmm0.
62 define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
63 ; SSE-LABEL: test_div_ss:
65 ; SSE-NEXT: divss %xmm1, %xmm0
68 ; AVX-LABEL: test_div_ss:
70 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
72 %1 = extractelement <4 x float> %b, i32 0
73 %2 = extractelement <4 x float> %a, i32 0
74 %div = fdiv float %2, %1
75 %3 = insertelement <4 x float> %a, float %div, i32 0
; Double-precision variant: computes a[0] + b[0] from the extracted elements
; and inserts the sum into lane 0 of %a. The checks expect a single addsd
; (SSE runs) or vaddsd (AVX run) on %xmm1 and %xmm0.
79 define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
80 ; SSE-LABEL: test_add_sd:
82 ; SSE-NEXT: addsd %xmm1, %xmm0
85 ; AVX-LABEL: test_add_sd:
87 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
89 %1 = extractelement <2 x double> %b, i32 0
90 %2 = extractelement <2 x double> %a, i32 0
91 %add = fadd double %2, %1
92 %3 = insertelement <2 x double> %a, double %add, i32 0
; Double-precision variant: computes a[0] - b[0] from the extracted elements
; and inserts the difference into lane 0 of %a. The checks expect a single
; subsd (SSE runs) or vsubsd (AVX run) on %xmm1 and %xmm0.
96 define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
97 ; SSE-LABEL: test_sub_sd:
99 ; SSE-NEXT: subsd %xmm1, %xmm0
102 ; AVX-LABEL: test_sub_sd:
104 ; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
106 %1 = extractelement <2 x double> %b, i32 0
107 %2 = extractelement <2 x double> %a, i32 0
108 %sub = fsub double %2, %1
109 %3 = insertelement <2 x double> %a, double %sub, i32 0
; Double-precision variant: computes a[0] * b[0] from the extracted elements
; and inserts the product into lane 0 of %a. The checks expect a single mulsd
; (SSE runs) or vmulsd (AVX run) on %xmm1 and %xmm0.
113 define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
114 ; SSE-LABEL: test_mul_sd:
116 ; SSE-NEXT: mulsd %xmm1, %xmm0
119 ; AVX-LABEL: test_mul_sd:
121 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
123 %1 = extractelement <2 x double> %b, i32 0
124 %2 = extractelement <2 x double> %a, i32 0
125 %mul = fmul double %2, %1
126 %3 = insertelement <2 x double> %a, double %mul, i32 0
; Double-precision variant: computes a[0] / b[0] from the extracted elements
; and inserts the quotient into lane 0 of %a. The checks expect a single divsd
; (SSE runs) or vdivsd (AVX run) on %xmm1 and %xmm0.
130 define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
131 ; SSE-LABEL: test_div_sd:
133 ; SSE-NEXT: divsd %xmm1, %xmm0
136 ; AVX-LABEL: test_div_sd:
138 ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
140 %1 = extractelement <2 x double> %b, i32 0
141 %2 = extractelement <2 x double> %a, i32 0
142 %div = fdiv double %2, %1
143 %3 = insertelement <2 x double> %a, double %div, i32 0
; Same pattern as the first group, but the result goes into lane 0 of %b
; rather than %a: computes a[0] + b[0] and inserts it into %b. The SSE checks
; expect addss into %xmm1 followed by a movaps copy to %xmm0; the AVX checks
; expect one three-operand vaddss writing %xmm0.
147 define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
148 ; SSE-LABEL: test2_add_ss:
150 ; SSE-NEXT: addss %xmm0, %xmm1
151 ; SSE-NEXT: movaps %xmm1, %xmm0
154 ; AVX-LABEL: test2_add_ss:
156 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
158 %1 = extractelement <4 x float> %a, i32 0
159 %2 = extractelement <4 x float> %b, i32 0
160 %add = fadd float %1, %2
161 %3 = insertelement <4 x float> %b, float %add, i32 0
; Result goes into lane 0 of %b: computes b[0] - a[0] (note the operand order
; of the fsub) and inserts it into %b. The SSE checks expect subss into %xmm1
; followed by a movaps copy to %xmm0; the AVX checks expect one three-operand
; vsubss writing %xmm0.
165 define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
166 ; SSE-LABEL: test2_sub_ss:
168 ; SSE-NEXT: subss %xmm0, %xmm1
169 ; SSE-NEXT: movaps %xmm1, %xmm0
172 ; AVX-LABEL: test2_sub_ss:
174 ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
176 %1 = extractelement <4 x float> %a, i32 0
177 %2 = extractelement <4 x float> %b, i32 0
178 %sub = fsub float %2, %1
179 %3 = insertelement <4 x float> %b, float %sub, i32 0
; Result goes into lane 0 of %b: computes a[0] * b[0] and inserts it into %b.
; The SSE checks expect mulss into %xmm1 followed by a movaps copy to %xmm0;
; the AVX checks expect one three-operand vmulss writing %xmm0.
183 define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
184 ; SSE-LABEL: test2_mul_ss:
186 ; SSE-NEXT: mulss %xmm0, %xmm1
187 ; SSE-NEXT: movaps %xmm1, %xmm0
190 ; AVX-LABEL: test2_mul_ss:
192 ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
194 %1 = extractelement <4 x float> %a, i32 0
195 %2 = extractelement <4 x float> %b, i32 0
196 %mul = fmul float %1, %2
197 %3 = insertelement <4 x float> %b, float %mul, i32 0
; Result goes into lane 0 of %b: computes b[0] / a[0] (note the operand order
; of the fdiv) and inserts it into %b. The SSE checks expect divss into %xmm1
; followed by a movaps copy to %xmm0; the AVX checks expect one three-operand
; vdivss writing %xmm0.
201 define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
202 ; SSE-LABEL: test2_div_ss:
204 ; SSE-NEXT: divss %xmm0, %xmm1
205 ; SSE-NEXT: movaps %xmm1, %xmm0
208 ; AVX-LABEL: test2_div_ss:
210 ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
212 %1 = extractelement <4 x float> %a, i32 0
213 %2 = extractelement <4 x float> %b, i32 0
214 %div = fdiv float %2, %1
215 %3 = insertelement <4 x float> %b, float %div, i32 0
; Double-precision variant with the result in lane 0 of %b: computes
; a[0] + b[0] and inserts it into %b. The SSE checks expect addsd into %xmm1
; followed by a movaps copy to %xmm0; the AVX checks expect one three-operand
; vaddsd writing %xmm0.
219 define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
220 ; SSE-LABEL: test2_add_sd:
222 ; SSE-NEXT: addsd %xmm0, %xmm1
223 ; SSE-NEXT: movaps %xmm1, %xmm0
226 ; AVX-LABEL: test2_add_sd:
228 ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
230 %1 = extractelement <2 x double> %a, i32 0
231 %2 = extractelement <2 x double> %b, i32 0
232 %add = fadd double %1, %2
233 %3 = insertelement <2 x double> %b, double %add, i32 0
; Double-precision variant with the result in lane 0 of %b: computes
; b[0] - a[0] (note the operand order of the fsub) and inserts it into %b.
; The SSE checks expect subsd into %xmm1 followed by a movaps copy to %xmm0;
; the AVX checks expect one three-operand vsubsd writing %xmm0.
237 define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
238 ; SSE-LABEL: test2_sub_sd:
240 ; SSE-NEXT: subsd %xmm0, %xmm1
241 ; SSE-NEXT: movaps %xmm1, %xmm0
244 ; AVX-LABEL: test2_sub_sd:
246 ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
248 %1 = extractelement <2 x double> %a, i32 0
249 %2 = extractelement <2 x double> %b, i32 0
250 %sub = fsub double %2, %1
251 %3 = insertelement <2 x double> %b, double %sub, i32 0
; Double-precision variant with the result in lane 0 of %b: computes
; a[0] * b[0] and inserts it into %b. The SSE checks expect mulsd into %xmm1
; followed by a movaps copy to %xmm0; the AVX checks expect one three-operand
; vmulsd writing %xmm0.
255 define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
256 ; SSE-LABEL: test2_mul_sd:
258 ; SSE-NEXT: mulsd %xmm0, %xmm1
259 ; SSE-NEXT: movaps %xmm1, %xmm0
262 ; AVX-LABEL: test2_mul_sd:
264 ; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
266 %1 = extractelement <2 x double> %a, i32 0
267 %2 = extractelement <2 x double> %b, i32 0
268 %mul = fmul double %1, %2
269 %3 = insertelement <2 x double> %b, double %mul, i32 0
; Double-precision variant with the result in lane 0 of %b: computes
; b[0] / a[0] (note the operand order of the fdiv) and inserts it into %b.
; The SSE checks expect divsd into %xmm1 followed by a movaps copy to %xmm0;
; the AVX checks expect one three-operand vdivsd writing %xmm0.
273 define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
274 ; SSE-LABEL: test2_div_sd:
276 ; SSE-NEXT: divsd %xmm0, %xmm1
277 ; SSE-NEXT: movaps %xmm1, %xmm0
280 ; AVX-LABEL: test2_div_sd:
282 ; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0
284 %1 = extractelement <2 x double> %a, i32 0
285 %2 = extractelement <2 x double> %b, i32 0
286 %div = fdiv double %2, %1
287 %3 = insertelement <2 x double> %b, double %div, i32 0
; Two chained scalar adds before the insert: %add = a[0] + b[0], then
; %add2 = a[0] + %add, inserted into lane 0 of %a. The checks expect exactly
; two back-to-back scalar adds (addss for the SSE runs, vaddss for AVX).
291 define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) {
292 ; SSE-LABEL: test_multiple_add_ss:
294 ; SSE-NEXT: addss %xmm0, %xmm1
295 ; SSE-NEXT: addss %xmm1, %xmm0
298 ; AVX-LABEL: test_multiple_add_ss:
300 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm1
301 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
303 %1 = extractelement <4 x float> %b, i32 0
304 %2 = extractelement <4 x float> %a, i32 0
305 %add = fadd float %2, %1
306 %add2 = fadd float %2, %add
307 %3 = insertelement <4 x float> %a, float %add2, i32 0
; Two chained scalar subtracts before the insert: %sub = a[0] - b[0], then
; %sub2 = a[0] - %sub, inserted into lane 0 of %a. The SSE checks expect a
; movaps copy of %xmm0 into %xmm2 ahead of the two subss instructions; the
; AVX checks expect two vsubss instructions with no copy.
311 define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) {
312 ; SSE-LABEL: test_multiple_sub_ss:
314 ; SSE-NEXT: movaps %xmm0, %xmm2
315 ; SSE-NEXT: subss %xmm1, %xmm2
316 ; SSE-NEXT: subss %xmm2, %xmm0
319 ; AVX-LABEL: test_multiple_sub_ss:
321 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm1
322 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
324 %1 = extractelement <4 x float> %b, i32 0
325 %2 = extractelement <4 x float> %a, i32 0
326 %sub = fsub float %2, %1
327 %sub2 = fsub float %2, %sub
328 %3 = insertelement <4 x float> %a, float %sub2, i32 0
; Two chained scalar multiplies before the insert: %mul = a[0] * b[0], then
; %mul2 = a[0] * %mul, inserted into lane 0 of %a. The checks expect exactly
; two back-to-back scalar multiplies (mulss for the SSE runs, vmulss for AVX).
332 define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) {
333 ; SSE-LABEL: test_multiple_mul_ss:
335 ; SSE-NEXT: mulss %xmm0, %xmm1
336 ; SSE-NEXT: mulss %xmm1, %xmm0
339 ; AVX-LABEL: test_multiple_mul_ss:
341 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1
342 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
344 %1 = extractelement <4 x float> %b, i32 0
345 %2 = extractelement <4 x float> %a, i32 0
346 %mul = fmul float %2, %1
347 %mul2 = fmul float %2, %mul
348 %3 = insertelement <4 x float> %a, float %mul2, i32 0
; Two chained scalar divides before the insert: %div = a[0] / b[0], then
; %div2 = a[0] / %div, inserted into lane 0 of %a. The SSE checks expect a
; movaps copy of %xmm0 into %xmm2 ahead of the two divss instructions; the
; AVX checks expect two vdivss instructions with no copy.
352 define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
353 ; SSE-LABEL: test_multiple_div_ss:
355 ; SSE-NEXT: movaps %xmm0, %xmm2
356 ; SSE-NEXT: divss %xmm1, %xmm2
357 ; SSE-NEXT: divss %xmm2, %xmm0
360 ; AVX-LABEL: test_multiple_div_ss:
362 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm1
363 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
365 %1 = extractelement <4 x float> %b, i32 0
366 %2 = extractelement <4 x float> %a, i32 0
367 %div = fdiv float %2, %1
368 %div2 = fdiv float %2, %div
369 %3 = insertelement <4 x float> %a, float %div2, i32 0
373 ; With SSE4.1 or greater, the shuffles in the following tests may
374 ; be lowered to X86Blendi nodes.
; Takes a vector %a and a scalar %b. Computes %b + a[0], places it in lane 0
; of an otherwise-undef vector, then uses a shufflevector to keep that lane 0
; and take lanes 1-3 from %a (mask indices 5, 6, 7 select from the second
; operand). The checks expect this to collapse to one addss / vaddss.
376 define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
377 ; SSE-LABEL: blend_add_ss:
379 ; SSE-NEXT: addss %xmm1, %xmm0
382 ; AVX-LABEL: blend_add_ss:
384 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
387 %ext = extractelement <4 x float> %a, i32 0
388 %op = fadd float %b, %ext
389 %ins = insertelement <4 x float> undef, float %op, i32 0
390 %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
391 ret <4 x float> %shuf
; Takes a vector %a and a scalar %b. Computes a[0] - %b, places it in lane 0
; of an otherwise-undef vector, then uses a shufflevector to keep that lane 0
; and take lanes 1-3 from %a (mask indices 5, 6, 7 select from the second
; operand). The checks expect this to collapse to one subss / vsubss.
394 define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) {
395 ; SSE-LABEL: blend_sub_ss:
397 ; SSE-NEXT: subss %xmm1, %xmm0
400 ; AVX-LABEL: blend_sub_ss:
402 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
405 %ext = extractelement <4 x float> %a, i32 0
406 %op = fsub float %ext, %b
407 %ins = insertelement <4 x float> undef, float %op, i32 0
408 %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
409 ret <4 x float> %shuf
; Takes a vector %a and a scalar %b. Computes %b * a[0], places it in lane 0
; of an otherwise-undef vector, then uses a shufflevector to keep that lane 0
; and take lanes 1-3 from %a (mask indices 5, 6, 7 select from the second
; operand). The checks expect this to collapse to one mulss / vmulss.
412 define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) {
413 ; SSE-LABEL: blend_mul_ss:
415 ; SSE-NEXT: mulss %xmm1, %xmm0
418 ; AVX-LABEL: blend_mul_ss:
420 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
423 %ext = extractelement <4 x float> %a, i32 0
424 %op = fmul float %b, %ext
425 %ins = insertelement <4 x float> undef, float %op, i32 0
426 %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
427 ret <4 x float> %shuf
; Takes a vector %a and a scalar %b. Computes a[0] / %b, places it in lane 0
; of an otherwise-undef vector, then uses a shufflevector to keep that lane 0
; and take lanes 1-3 from %a (mask indices 5, 6, 7 select from the second
; operand). The checks expect this to collapse to one divss / vdivss.
430 define <4 x float> @blend_div_ss(<4 x float> %a, float %b) {
431 ; SSE-LABEL: blend_div_ss:
433 ; SSE-NEXT: divss %xmm1, %xmm0
436 ; AVX-LABEL: blend_div_ss:
438 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
441 %ext = extractelement <4 x float> %a, i32 0
442 %op = fdiv float %ext, %b
443 %ins = insertelement <4 x float> undef, float %op, i32 0
444 %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
445 ret <4 x float> %shuf
; Double-precision variant. Computes %b + a[0], places it in lane 0 of an
; otherwise-undef vector, then uses a shufflevector to keep that lane 0 and
; take lane 1 from %a (mask index 3 selects from the second operand). The
; checks expect this to collapse to one addsd / vaddsd.
448 define <2 x double> @blend_add_sd(<2 x double> %a, double %b) {
449 ; SSE-LABEL: blend_add_sd:
451 ; SSE-NEXT: addsd %xmm1, %xmm0
454 ; AVX-LABEL: blend_add_sd:
456 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
459 %ext = extractelement <2 x double> %a, i32 0
460 %op = fadd double %b, %ext
461 %ins = insertelement <2 x double> undef, double %op, i32 0
462 %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
463 ret <2 x double> %shuf
; Double-precision variant. Computes a[0] - %b, places it in lane 0 of an
; otherwise-undef vector, then uses a shufflevector to keep that lane 0 and
; take lane 1 from %a (mask index 3 selects from the second operand). The
; checks expect this to collapse to one subsd / vsubsd.
466 define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) {
467 ; SSE-LABEL: blend_sub_sd:
469 ; SSE-NEXT: subsd %xmm1, %xmm0
472 ; AVX-LABEL: blend_sub_sd:
474 ; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
477 %ext = extractelement <2 x double> %a, i32 0
478 %op = fsub double %ext, %b
479 %ins = insertelement <2 x double> undef, double %op, i32 0
480 %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
481 ret <2 x double> %shuf
; Double-precision variant. Computes %b * a[0], places it in lane 0 of an
; otherwise-undef vector, then uses a shufflevector to keep that lane 0 and
; take lane 1 from %a (mask index 3 selects from the second operand). The
; checks expect this to collapse to one mulsd / vmulsd.
484 define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) {
485 ; SSE-LABEL: blend_mul_sd:
487 ; SSE-NEXT: mulsd %xmm1, %xmm0
490 ; AVX-LABEL: blend_mul_sd:
492 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
495 %ext = extractelement <2 x double> %a, i32 0
496 %op = fmul double %b, %ext
497 %ins = insertelement <2 x double> undef, double %op, i32 0
498 %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
499 ret <2 x double> %shuf
; Double-precision variant. Computes a[0] / %b, places it in lane 0 of an
; otherwise-undef vector, then uses a shufflevector to keep that lane 0 and
; take lane 1 from %a (mask index 3 selects from the second operand). The
; checks expect this to collapse to one divsd / vdivsd.
502 define <2 x double> @blend_div_sd(<2 x double> %a, double %b) {
503 ; SSE-LABEL: blend_div_sd:
505 ; SSE-NEXT: divsd %xmm1, %xmm0
508 ; AVX-LABEL: blend_div_sd:
510 ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
513 %ext = extractelement <2 x double> %a, i32 0
514 %op = fdiv double %ext, %b
515 %ins = insertelement <2 x double> undef, double %op, i32 0
516 %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
517 ret <2 x double> %shuf
520 ; Ensure that the backend selects SSE/AVX scalar fp instructions
521 ; from a packed fp instruction plus a vector insert.
; Packed fadd of %a and %b followed by a shufflevector that keeps only lane 0
; of the sum and takes lanes 1-3 from %a (mask indices 5, 6, 7 select from the
; second operand). The checks expect the scalar addss / vaddss instead of a
; packed add plus a separate insert.
523 define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) {
524 ; SSE-LABEL: insert_test_add_ss:
526 ; SSE-NEXT: addss %xmm1, %xmm0
529 ; AVX-LABEL: insert_test_add_ss:
531 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
533 %1 = fadd <4 x float> %a, %b
534 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed fsub (%a - %b) followed by a shufflevector that keeps only lane 0 of
; the difference and takes lanes 1-3 from %a (mask indices 5, 6, 7 select from
; the second operand). The checks expect the scalar subss / vsubss instead of
; a packed subtract plus a separate insert.
538 define <4 x float> @insert_test_sub_ss(<4 x float> %a, <4 x float> %b) {
539 ; SSE-LABEL: insert_test_sub_ss:
541 ; SSE-NEXT: subss %xmm1, %xmm0
544 ; AVX-LABEL: insert_test_sub_ss:
546 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
548 %1 = fsub <4 x float> %a, %b
549 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed fmul of %a and %b followed by a shufflevector that keeps only lane 0
; of the product and takes lanes 1-3 from %a (mask indices 5, 6, 7 select from
; the second operand). The checks expect the scalar mulss / vmulss instead of
; a packed multiply plus a separate insert.
553 define <4 x float> @insert_test_mul_ss(<4 x float> %a, <4 x float> %b) {
554 ; SSE-LABEL: insert_test_mul_ss:
556 ; SSE-NEXT: mulss %xmm1, %xmm0
559 ; AVX-LABEL: insert_test_mul_ss:
561 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
563 %1 = fmul <4 x float> %a, %b
564 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed fdiv (%a / %b) followed by a shufflevector that keeps only lane 0 of
; the quotient and takes lanes 1-3 from %a (mask indices 5, 6, 7 select from
; the second operand). The checks expect the scalar divss / vdivss instead of
; a packed divide plus a separate insert.
568 define <4 x float> @insert_test_div_ss(<4 x float> %a, <4 x float> %b) {
569 ; SSE-LABEL: insert_test_div_ss:
571 ; SSE-NEXT: divss %xmm1, %xmm0
574 ; AVX-LABEL: insert_test_div_ss:
576 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
578 %1 = fdiv <4 x float> %a, %b
579 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Double-precision variant: packed fadd of %a and %b followed by a
; shufflevector that keeps lane 0 of the sum and takes lane 1 from %a (mask
; index 3 selects from the second operand). The checks expect the scalar
; addsd / vaddsd.
583 define <2 x double> @insert_test_add_sd(<2 x double> %a, <2 x double> %b) {
584 ; SSE-LABEL: insert_test_add_sd:
586 ; SSE-NEXT: addsd %xmm1, %xmm0
589 ; AVX-LABEL: insert_test_add_sd:
591 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
593 %1 = fadd <2 x double> %a, %b
594 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
; Double-precision variant: packed fsub (%a - %b) followed by a shufflevector
; that keeps lane 0 of the difference and takes lane 1 from %a (mask index 3
; selects from the second operand). The checks expect the scalar
; subsd / vsubsd.
598 define <2 x double> @insert_test_sub_sd(<2 x double> %a, <2 x double> %b) {
599 ; SSE-LABEL: insert_test_sub_sd:
601 ; SSE-NEXT: subsd %xmm1, %xmm0
604 ; AVX-LABEL: insert_test_sub_sd:
606 ; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
608 %1 = fsub <2 x double> %a, %b
609 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
; Double-precision variant: packed fmul of %a and %b followed by a
; shufflevector that keeps lane 0 of the product and takes lane 1 from %a
; (mask index 3 selects from the second operand). The checks expect the
; scalar mulsd / vmulsd.
613 define <2 x double> @insert_test_mul_sd(<2 x double> %a, <2 x double> %b) {
614 ; SSE-LABEL: insert_test_mul_sd:
616 ; SSE-NEXT: mulsd %xmm1, %xmm0
619 ; AVX-LABEL: insert_test_mul_sd:
621 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
623 %1 = fmul <2 x double> %a, %b
624 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
; Double-precision variant: packed fdiv (%a / %b) followed by a shufflevector
; that keeps lane 0 of the quotient and takes lane 1 from %a (mask index 3
; selects from the second operand). The checks expect the scalar
; divsd / vdivsd.
628 define <2 x double> @insert_test_div_sd(<2 x double> %a, <2 x double> %b) {
629 ; SSE-LABEL: insert_test_div_sd:
631 ; SSE-NEXT: divsd %xmm1, %xmm0
634 ; AVX-LABEL: insert_test_div_sd:
636 ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
638 %1 = fdiv <2 x double> %a, %b
639 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
; Operands swapped relative to insert_test_add_ss: packed fadd of %b and %a,
; then a shufflevector keeping lane 0 of the sum and lanes 1-3 of %b. The SSE
; checks expect addss into %xmm1 followed by a movaps copy to %xmm0; the AVX
; checks expect one three-operand vaddss writing %xmm0.
643 define <4 x float> @insert_test2_add_ss(<4 x float> %a, <4 x float> %b) {
644 ; SSE-LABEL: insert_test2_add_ss:
646 ; SSE-NEXT: addss %xmm0, %xmm1
647 ; SSE-NEXT: movaps %xmm1, %xmm0
650 ; AVX-LABEL: insert_test2_add_ss:
652 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
654 %1 = fadd <4 x float> %b, %a
655 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Operands swapped relative to insert_test_sub_ss: packed fsub (%b - %a),
; then a shufflevector keeping lane 0 of the difference and lanes 1-3 of %b.
; The SSE checks expect subss into %xmm1 followed by a movaps copy to %xmm0;
; the AVX checks expect one three-operand vsubss writing %xmm0.
659 define <4 x float> @insert_test2_sub_ss(<4 x float> %a, <4 x float> %b) {
660 ; SSE-LABEL: insert_test2_sub_ss:
662 ; SSE-NEXT: subss %xmm0, %xmm1
663 ; SSE-NEXT: movaps %xmm1, %xmm0
666 ; AVX-LABEL: insert_test2_sub_ss:
668 ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
670 %1 = fsub <4 x float> %b, %a
671 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Operands swapped relative to insert_test_mul_ss: packed fmul of %b and %a,
; then a shufflevector keeping lane 0 of the product and lanes 1-3 of %b. The
; SSE checks expect mulss into %xmm1 followed by a movaps copy to %xmm0; the
; AVX checks expect one three-operand vmulss writing %xmm0.
675 define <4 x float> @insert_test2_mul_ss(<4 x float> %a, <4 x float> %b) {
676 ; SSE-LABEL: insert_test2_mul_ss:
678 ; SSE-NEXT: mulss %xmm0, %xmm1
679 ; SSE-NEXT: movaps %xmm1, %xmm0
682 ; AVX-LABEL: insert_test2_mul_ss:
684 ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
686 %1 = fmul <4 x float> %b, %a
687 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Operands swapped relative to insert_test_div_ss: packed fdiv (%b / %a),
; then a shufflevector keeping lane 0 of the quotient and lanes 1-3 of %b.
; The SSE checks expect divss into %xmm1 followed by a movaps copy to %xmm0;
; the AVX checks expect one three-operand vdivss writing %xmm0.
691 define <4 x float> @insert_test2_div_ss(<4 x float> %a, <4 x float> %b) {
692 ; SSE-LABEL: insert_test2_div_ss:
694 ; SSE-NEXT: divss %xmm0, %xmm1
695 ; SSE-NEXT: movaps %xmm1, %xmm0
698 ; AVX-LABEL: insert_test2_div_ss:
700 ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
702 %1 = fdiv <4 x float> %b, %a
703 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Double-precision variant with swapped operands: packed fadd of %b and %a,
; then a shufflevector keeping lane 0 of the sum and lane 1 of %b. The SSE
; checks expect addsd into %xmm1 followed by a movaps copy to %xmm0; the AVX
; checks expect one three-operand vaddsd writing %xmm0.
707 define <2 x double> @insert_test2_add_sd(<2 x double> %a, <2 x double> %b) {
708 ; SSE-LABEL: insert_test2_add_sd:
710 ; SSE-NEXT: addsd %xmm0, %xmm1
711 ; SSE-NEXT: movaps %xmm1, %xmm0
714 ; AVX-LABEL: insert_test2_add_sd:
716 ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
718 %1 = fadd <2 x double> %b, %a
719 %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
; Double-precision variant with swapped operands: packed fsub (%b - %a), then
; a shufflevector keeping lane 0 of the difference and lane 1 of %b. The SSE
; checks expect subsd into %xmm1 followed by a movaps copy to %xmm0; the AVX
; checks expect one three-operand vsubsd writing %xmm0.
723 define <2 x double> @insert_test2_sub_sd(<2 x double> %a, <2 x double> %b) {
724 ; SSE-LABEL: insert_test2_sub_sd:
726 ; SSE-NEXT: subsd %xmm0, %xmm1
727 ; SSE-NEXT: movaps %xmm1, %xmm0
730 ; AVX-LABEL: insert_test2_sub_sd:
732 ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
734 %1 = fsub <2 x double> %b, %a
735 %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
; Double-precision variant with swapped operands: packed fmul of %b and %a,
; then a shufflevector keeping lane 0 of the product and lane 1 of %b. The SSE
; checks expect mulsd into %xmm1 followed by a movaps copy to %xmm0; the AVX
; checks expect one three-operand vmulsd writing %xmm0.
739 define <2 x double> @insert_test2_mul_sd(<2 x double> %a, <2 x double> %b) {
740 ; SSE-LABEL: insert_test2_mul_sd:
742 ; SSE-NEXT: mulsd %xmm0, %xmm1
743 ; SSE-NEXT: movaps %xmm1, %xmm0
746 ; AVX-LABEL: insert_test2_mul_sd:
748 ; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
750 %1 = fmul <2 x double> %b, %a
751 %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
; Double-precision variant with swapped operands: packed fdiv (%b / %a), then
; a shufflevector keeping lane 0 of the quotient and lane 1 of %b. The SSE
; checks expect divsd into %xmm1 followed by a movaps copy to %xmm0; the AVX
; checks expect one three-operand vdivsd writing %xmm0.
755 define <2 x double> @insert_test2_div_sd(<2 x double> %a, <2 x double> %b) {
756 ; SSE-LABEL: insert_test2_div_sd:
758 ; SSE-NEXT: divsd %xmm0, %xmm1
759 ; SSE-NEXT: movaps %xmm1, %xmm0
762 ; AVX-LABEL: insert_test2_div_sd:
764 ; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0
766 %1 = fdiv <2 x double> %b, %a
767 %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
; Same merge as insert_test_add_ss, expressed with a constant-mask select
; instead of a shufflevector: the false lane (lane 0) takes the packed sum %1,
; the true lanes (1-3) take %a. The checks expect the scalar addss / vaddss.
771 define <4 x float> @insert_test3_add_ss(<4 x float> %a, <4 x float> %b) {
772 ; SSE-LABEL: insert_test3_add_ss:
774 ; SSE-NEXT: addss %xmm1, %xmm0
777 ; AVX-LABEL: insert_test3_add_ss:
779 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
781 %1 = fadd <4 x float> %a, %b
782 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
; Same merge as insert_test_sub_ss, expressed with a constant-mask select
; instead of a shufflevector: the false lane (lane 0) takes the packed
; difference %1 (%a - %b), the true lanes (1-3) take %a. The checks expect the
; scalar subss / vsubss.
786 define <4 x float> @insert_test3_sub_ss(<4 x float> %a, <4 x float> %b) {
787 ; SSE-LABEL: insert_test3_sub_ss:
789 ; SSE-NEXT: subss %xmm1, %xmm0
792 ; AVX-LABEL: insert_test3_sub_ss:
794 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
796 %1 = fsub <4 x float> %a, %b
797 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
; Same merge as insert_test_mul_ss, expressed with a constant-mask select
; instead of a shufflevector: the false lane (lane 0) takes the packed product
; %1, the true lanes (1-3) take %a. The checks expect the scalar
; mulss / vmulss.
801 define <4 x float> @insert_test3_mul_ss(<4 x float> %a, <4 x float> %b) {
802 ; SSE-LABEL: insert_test3_mul_ss:
804 ; SSE-NEXT: mulss %xmm1, %xmm0
807 ; AVX-LABEL: insert_test3_mul_ss:
809 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
811 %1 = fmul <4 x float> %a, %b
812 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
; Same merge as insert_test_div_ss, expressed with a constant-mask select
; instead of a shufflevector: the false lane (lane 0) takes the packed
; quotient %1 (%a / %b), the true lanes (1-3) take %a. The checks expect the
; scalar divss / vdivss.
816 define <4 x float> @insert_test3_div_ss(<4 x float> %a, <4 x float> %b) {
817 ; SSE-LABEL: insert_test3_div_ss:
819 ; SSE-NEXT: divss %xmm1, %xmm0
822 ; AVX-LABEL: insert_test3_div_ss:
824 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
826 %1 = fdiv <4 x float> %a, %b
827 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
; Double-precision select form: packed fadd of %a and %b, then a constant-mask
; select where the false lane (lane 0) takes the sum %1 and the true lane
; (lane 1) takes %a. The checks expect the scalar addsd / vaddsd.
831 define <2 x double> @insert_test3_add_sd(<2 x double> %a, <2 x double> %b) {
832 ; SSE-LABEL: insert_test3_add_sd:
834 ; SSE-NEXT: addsd %xmm1, %xmm0
837 ; AVX-LABEL: insert_test3_add_sd:
839 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
841 %1 = fadd <2 x double> %a, %b
842 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
; Double-precision select form: packed fsub (%a - %b), then a constant-mask
; select where the false lane (lane 0) takes the difference %1 and the true
; lane (lane 1) takes %a. The checks expect the scalar subsd / vsubsd.
846 define <2 x double> @insert_test3_sub_sd(<2 x double> %a, <2 x double> %b) {
847 ; SSE-LABEL: insert_test3_sub_sd:
849 ; SSE-NEXT: subsd %xmm1, %xmm0
852 ; AVX-LABEL: insert_test3_sub_sd:
854 ; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
856 %1 = fsub <2 x double> %a, %b
857 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
; Double-precision select form: packed fmul of %a and %b, then a constant-mask
; select where the false lane (lane 0) takes the product %1 and the true lane
; (lane 1) takes %a. The checks expect the scalar mulsd / vmulsd.
861 define <2 x double> @insert_test3_mul_sd(<2 x double> %a, <2 x double> %b) {
862 ; SSE-LABEL: insert_test3_mul_sd:
864 ; SSE-NEXT: mulsd %xmm1, %xmm0
867 ; AVX-LABEL: insert_test3_mul_sd:
869 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
871 %1 = fmul <2 x double> %a, %b
872 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
; Double-precision select form: packed fdiv (%a / %b), then a constant-mask
; select where the false lane (lane 0) takes the quotient %1 and the true lane
; (lane 1) takes %a. The checks expect the scalar divsd / vdivsd.
876 define <2 x double> @insert_test3_div_sd(<2 x double> %a, <2 x double> %b) {
877 ; SSE-LABEL: insert_test3_div_sd:
879 ; SSE-NEXT: divsd %xmm1, %xmm0
882 ; AVX-LABEL: insert_test3_div_sd:
884 ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
886 %1 = fdiv <2 x double> %a, %b
887 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
; Select form with swapped operands: packed fadd of %b and %a, then a
; constant-mask select where the false lane (lane 0) takes the sum %1 and the
; true lanes (1-3) take %b. The SSE checks expect addss into %xmm1 followed by
; a movaps copy to %xmm0; the AVX checks expect one vaddss writing %xmm0.
891 define <4 x float> @insert_test4_add_ss(<4 x float> %a, <4 x float> %b) {
892 ; SSE-LABEL: insert_test4_add_ss:
894 ; SSE-NEXT: addss %xmm0, %xmm1
895 ; SSE-NEXT: movaps %xmm1, %xmm0
898 ; AVX-LABEL: insert_test4_add_ss:
900 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
902 %1 = fadd <4 x float> %b, %a
903 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
; Select form with swapped operands: packed fsub (%b - %a), then a
; constant-mask select where the false lane (lane 0) takes the difference %1
; and the true lanes (1-3) take %b. The SSE checks expect subss into %xmm1
; followed by a movaps copy to %xmm0; the AVX checks expect one vsubss
; writing %xmm0.
907 define <4 x float> @insert_test4_sub_ss(<4 x float> %a, <4 x float> %b) {
908 ; SSE-LABEL: insert_test4_sub_ss:
910 ; SSE-NEXT: subss %xmm0, %xmm1
911 ; SSE-NEXT: movaps %xmm1, %xmm0
914 ; AVX-LABEL: insert_test4_sub_ss:
916 ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
918 %1 = fsub <4 x float> %b, %a
919 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
; Select form with swapped operands: packed fmul of %b and %a, then a
; constant-mask select where the false lane (lane 0) takes the product %1 and
; the true lanes (1-3) take %b. The SSE checks expect mulss into %xmm1
; followed by a movaps copy to %xmm0; the AVX checks expect one vmulss
; writing %xmm0.
923 define <4 x float> @insert_test4_mul_ss(<4 x float> %a, <4 x float> %b) {
924 ; SSE-LABEL: insert_test4_mul_ss:
926 ; SSE-NEXT: mulss %xmm0, %xmm1
927 ; SSE-NEXT: movaps %xmm1, %xmm0
930 ; AVX-LABEL: insert_test4_mul_ss:
932 ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
934 %1 = fmul <4 x float> %b, %a
935 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
; Select form with swapped operands: packed fdiv (%b / %a), then a
; constant-mask select where the false lane (lane 0) takes the quotient %1
; and the true lanes (1-3) take %b. The SSE checks expect divss into %xmm1
; followed by a movaps copy to %xmm0; the AVX checks expect one vdivss
; writing %xmm0.
939 define <4 x float> @insert_test4_div_ss(<4 x float> %a, <4 x float> %b) {
940 ; SSE-LABEL: insert_test4_div_ss:
942 ; SSE-NEXT: divss %xmm0, %xmm1
943 ; SSE-NEXT: movaps %xmm1, %xmm0
946 ; AVX-LABEL: insert_test4_div_ss:
948 ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
950 %1 = fdiv <4 x float> %b, %a
951 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
; Double-precision select form with swapped operands: packed fadd of %b and
; %a, then a constant-mask select where the false lane (lane 0) takes the sum
; %1 and the true lane (lane 1) takes %b. The SSE checks expect addsd into
; %xmm1 followed by a movaps copy to %xmm0; the AVX checks expect one vaddsd
; writing %xmm0.
955 define <2 x double> @insert_test4_add_sd(<2 x double> %a, <2 x double> %b) {
956 ; SSE-LABEL: insert_test4_add_sd:
958 ; SSE-NEXT: addsd %xmm0, %xmm1
959 ; SSE-NEXT: movaps %xmm1, %xmm0
962 ; AVX-LABEL: insert_test4_add_sd:
964 ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
966 %1 = fadd <2 x double> %b, %a
967 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
; Double-precision select form with swapped operands: packed fsub (%b - %a),
; then a constant-mask select where the false lane (lane 0) takes the
; difference %1 and the true lane (lane 1) takes %b. The SSE checks expect
; subsd into %xmm1 followed by a movaps copy to %xmm0; the AVX checks expect
; one vsubsd writing %xmm0.
971 define <2 x double> @insert_test4_sub_sd(<2 x double> %a, <2 x double> %b) {
972 ; SSE-LABEL: insert_test4_sub_sd:
974 ; SSE-NEXT: subsd %xmm0, %xmm1
975 ; SSE-NEXT: movaps %xmm1, %xmm0
978 ; AVX-LABEL: insert_test4_sub_sd:
980 ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
982 %1 = fsub <2 x double> %b, %a
983 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
; Double-precision select form with swapped operands: packed fmul of %b and
; %a, then a constant-mask select where the false lane (lane 0) takes the
; product %1 and the true lane (lane 1) takes %b. The SSE checks expect mulsd
; into %xmm1 followed by a movaps copy to %xmm0; the AVX checks expect one
; vmulsd writing %xmm0.
987 define <2 x double> @insert_test4_mul_sd(<2 x double> %a, <2 x double> %b) {
988 ; SSE-LABEL: insert_test4_mul_sd:
990 ; SSE-NEXT: mulsd %xmm0, %xmm1
991 ; SSE-NEXT: movaps %xmm1, %xmm0
994 ; AVX-LABEL: insert_test4_mul_sd:
996 ; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
998 %1 = fmul <2 x double> %b, %a
999 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
; Double-precision select form with swapped operands: packed fdiv (%b / %a),
; then a constant-mask select where the false lane (lane 0) takes the
; quotient %1 and the true lane (lane 1) takes %b. The SSE checks expect divsd
; into %xmm1 followed by a movaps copy to %xmm0; the AVX checks expect one
; vdivsd writing %xmm0.
1003 define <2 x double> @insert_test4_div_sd(<2 x double> %a, <2 x double> %b) {
1004 ; SSE-LABEL: insert_test4_div_sd:
1006 ; SSE-NEXT: divsd %xmm0, %xmm1
1007 ; SSE-NEXT: movaps %xmm1, %xmm0
1010 ; AVX-LABEL: insert_test4_div_sd:
1012 ; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0
1014 %1 = fdiv <2 x double> %b, %a
1015 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1