1 ; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
; External 16-bit global; it is the load source in @pmovzxbq_1.
4 @g16 = external global i16
; Inserts the scalar i32 argument %s into lane 1 of %tmp.
; Both runs expect a pinsrd with immediate 1: the i686 run reads the scalar
; directly from its stack slot, the x86_64 run takes it from %edi.
6 define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
7 %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
10 ; X32: pinsrd $1, 4(%esp), %xmm0
12 ; X64-LABEL: pinsrd_1:
13 ; X64: pinsrd $1, %edi, %xmm0
; Byte-sized counterpart of @pinsrd_1: inserts the i8 argument %s into lane 1
; of a <16 x i8> vector. Both runs expect a pinsrb with immediate 1, fed from
; the stack slot on i686 and from %edi on x86_64.
16 define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
17 %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
19 ; X32-LABEL: pinsrb_1:
20 ; X32: pinsrb $1, 4(%esp), %xmm0
22 ; X64-LABEL: pinsrb_1:
23 ; X64: pinsrb $1, %edi, %xmm0
; Loads an i32 from %p, places it in lane 0 of a <4 x i32> whose other lanes
; are explicitly zero, reinterprets that as <16 x i8> and sign-extends with the
; pmovsxbd intrinsic.
; Both runs expect the load to be folded into pmovsxbd's memory operand (the
; i686 run first fetches the pointer argument from the stack).
27 define <2 x i64> @pmovsxbd_1(i32* %p) nounwind {
29 %0 = load i32* %p, align 4
30 %1 = insertelement <4 x i32> undef, i32 %0, i32 0
31 %2 = insertelement <4 x i32> %1, i32 0, i32 1
32 %3 = insertelement <4 x i32> %2, i32 0, i32 2
33 %4 = insertelement <4 x i32> %3, i32 0, i32 3
34 %5 = bitcast <4 x i32> %4 to <16 x i8>
35 %6 = tail call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %5) nounwind readnone
36 %7 = bitcast <4 x i32> %6 to <2 x i64>
40 ; X32: movl 4(%esp), %eax
41 ; X32: pmovsxbd (%eax), %xmm0
44 ; X64: pmovsxbd (%rdi), %xmm0
; Loads an i64 from %p into lane 0 of an otherwise-zero <2 x i64>, reinterprets
; it as <8 x i16> and sign-extends with the pmovsxwd intrinsic.
; Both runs expect the load to be folded into pmovsxwd's memory operand.
47 define <2 x i64> @pmovsxwd_1(i64* %p) nounwind readonly {
49 %0 = load i64* %p ; <i64> [#uses=1]
50 %tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0 ; <<2 x i64>> [#uses=1]
51 %1 = bitcast <2 x i64> %tmp2 to <8 x i16> ; <<8 x i16>> [#uses=1]
52 %2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone ; <<4 x i32>> [#uses=1]
53 %3 = bitcast <4 x i32> %2 to <2 x i64> ; <<2 x i64>> [#uses=1]
57 ; X32: movl 4(%esp), %eax
58 ; X32: pmovsxwd (%eax), %xmm0
61 ; X64: pmovsxwd (%rdi), %xmm0
; Loads the i16 global @g16 into lane 0 of an undef <8 x i16>, reinterprets it
; as <16 x i8> and zero-extends with the pmovzxbq intrinsic.
; Both runs expect the address of g16 to be materialised first (through the
; non-lazy pointer on i686, through the GOT on x86_64) and the load itself to
; be folded into pmovzxbq's memory operand.
67 define <2 x i64> @pmovzxbq_1() nounwind {
69 %0 = load i16* @g16, align 2 ; <i16> [#uses=1]
70 %1 = insertelement <8 x i16> undef, i16 %0, i32 0 ; <<8 x i16>> [#uses=1]
71 %2 = bitcast <8 x i16> %1 to <16 x i8> ; <<16 x i8>> [#uses=1]
72 %3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone ; <<2 x i64>> [#uses=1]
76 ; X32: movl L_g16$non_lazy_ptr, %eax
77 ; X32: pmovzxbq (%eax), %xmm0
80 ; X64: movq _g16@GOTPCREL(%rip), %rax
81 ; X64: pmovzxbq (%rax), %xmm0
; SSE4.1 sign-/zero-extension intrinsics called by @pmovsxbd_1, @pmovsxwd_1
; and @pmovzxbq_1.
84 declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
85 declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
86 declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
; Extracts lane 3 of a <4 x float> and bitcasts the scalar to i32.
; Both runs expect a single extractps with immediate 3 writing to %eax.
91 define i32 @extractps_1(<4 x float> %v) nounwind {
92 %s = extractelement <4 x float> %v, i32 3
93 %i = bitcast float %s to i32
97 ; X32: extractps $3, %xmm0, %eax
100 ; X64: extractps $3, %xmm0, %eax
; Same as @extractps_1 but with the bitcast applied to the whole vector before
; the extract instead of to the scalar after it. Both runs still expect
; extractps with immediate 3 writing to %eax.
102 define i32 @extractps_2(<4 x float> %v) nounwind {
103 %t = bitcast <4 x float> %v to <4 x i32>
104 %s = extractelement <4 x i32> %t, i32 3
108 ; X32: extractps $3, %xmm0, %eax
111 ; X64: extractps $3, %xmm0, %eax
115 ; The non-store form of extractps puts its result into a GPR.
116 ; This makes it suitable for an extract from a <4 x float> that
117 ; is bitcasted to i32, but unsuitable for much of anything else.
;
; @ext_1 extracts lane 3 as a float and adds 1.0 to it, so the value is
; expected to stay in an XMM register: both runs expect pshufd with immediate 3
; followed by an addss from the constant pool (RIP-relative on x86_64).
; NOTE(review): the constant-pool label LCPI7_0 appears to embed this
; function's position in the file (it is the eighth definition) - confirm
; before adding or removing functions above this one.
119 define float @ext_1(<4 x float> %v) nounwind {
120 %s = extractelement <4 x float> %v, i32 3
121 %t = fadd float %s, 1.0
125 ; X32: pshufd $3, %xmm0, %xmm0
126 ; X32: addss LCPI7_0, %xmm0
129 ; X64: pshufd $3, %xmm0, %xmm0
130 ; X64: addss LCPI7_0(%rip), %xmm0
; Extracts lane 3 of a <4 x float> as a float with no further arithmetic.
; Both runs expect pshufd with immediate 3 rather than extractps.
132 define float @ext_2(<4 x float> %v) nounwind {
133 %s = extractelement <4 x float> %v, i32 3
137 ; X32: pshufd $3, %xmm0, %xmm0
140 ; X64: pshufd $3, %xmm0, %xmm0
; Extracts lane 3 of a <4 x i32>. Both runs expect pextrd with immediate 3
; writing to %eax.
142 define i32 @ext_3(<4 x i32> %v) nounwind {
143 %i = extractelement <4 x i32> %v, i32 3
147 ; X32: pextrd $3, %xmm0, %eax
150 ; X64: pextrd $3, %xmm0, %eax
; Calls the insertps intrinsic directly with immediate 1. Both runs expect the
; immediate and the operand order (%xmm1 into %xmm0) to be passed through
; unchanged.
153 define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
154 %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone
155 ret <4 x float> %tmp1
157 ; X32: insertps $1, %xmm1, %xmm0
160 ; X64: insertps $1, %xmm1, %xmm0
; SSE4.1 insertps intrinsic called by @insertps_1; the i32 operand is the
; instruction immediate.
163 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
; Inserts a scalar float argument into lane 0 of %t1 with a plain
; insertelement. The i686 run expects insertps $0 with the scalar read from its
; stack slot; the x86_64 run expects the register form taking it from %xmm1.
165 define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind {
166 %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
167 ret <4 x float> %tmp1
169 ; X32: insertps $0, 4(%esp), %xmm0
172 ; X64: insertps $0, %xmm1, %xmm0
; Copies lane 0 of %t2 into lane 0 of %t1 via extractelement + insertelement.
; Both runs expect this pair to become a single register-to-register
; insertps $0.
175 define <4 x float> @insertps_3(<4 x float> %t1, <4 x float> %t2) nounwind {
176 %tmp2 = extractelement <4 x float> %t2, i32 0
177 %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
178 ret <4 x float> %tmp1
180 ; X32: insertps $0, %xmm1, %xmm0
183 ; X64: insertps $0, %xmm1, %xmm0
; Calls the ptestz intrinsic on two <2 x i64> operands. Both runs expect a
; ptest of %xmm1 against %xmm0.
186 define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
187 %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
190 ; X32: ptest %xmm1, %xmm0
194 ; X64: ptest %xmm1, %xmm0
; NOTE(review): despite the ptestz_ prefix in its name, this function calls the
; ptestc intrinsic. Both runs expect a ptest of %xmm1 against %xmm0.
198 define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
199 %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
202 ; X32: ptest %xmm1, %xmm0
206 ; X64: ptest %xmm1, %xmm0
; NOTE(review): despite the ptestz_ prefix in its name, this function calls the
; ptestnzc intrinsic. Both runs expect a ptest of %xmm1 against %xmm0.
210 define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
211 %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
214 ; X32: ptest %xmm1, %xmm0
218 ; X64: ptest %xmm1, %xmm0
; SSE4.1 ptest intrinsics called by @ptestz_1, @ptestz_2 and @ptestz_3
; respectively.
223 declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
224 declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
225 declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
227 ; This used to compile to insertps $0 + insertps $16. insertps $0 is always redundant, so the checks below reject it.
; Adds %A and %B lane by lane using scalar extractelement / fadd /
; insertelement, then rebuilds a <2 x float> from the two sums.
; For each run the negative checks forbid an "insertps $0" at two points after
; the function label.
229 define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
231 %tmp7 = extractelement <2 x float> %A, i32 0
232 %tmp5 = extractelement <2 x float> %A, i32 1
233 %tmp3 = extractelement <2 x float> %B, i32 0
234 %tmp1 = extractelement <2 x float> %B, i32 1
235 %add.r = fadd float %tmp7, %tmp3
236 %add.i = fadd float %tmp5, %tmp1
237 %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
238 %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
239 ret <2 x float> %tmp9
240 ; X32-LABEL: buildvector:
241 ; X32-NOT: insertps $0
243 ; X32-NOT: insertps $0
245 ; X64-LABEL: buildvector:
246 ; X64-NOT: insertps $0
248 ; X64-NOT: insertps $0
; Shuffle mask <0,1,2,4>: lanes 0-2 come from %a, lane 3 from element 0 of the
; vector loaded from %pb.
; Expected immediate 48 (0x30): source lane 0 -> destination lane 3, no lanes
; zeroed.
252 define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
254 %0 = load <4 x float>* %pb, align 16
255 %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
256 ret <4 x float> %vecinit6
257 ; CHECK-LABEL: insertps_from_shufflevector_1:
260 ; CHECK: insertps $48,
; Shuffle mask <0,1,5,3>: lane 2 of the result is element 1 of %b, all other
; lanes come from %a.
; Expected immediate 96 (0x60): source lane 1 -> destination lane 2, no lanes
; zeroed.
264 define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
266 %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
267 ret <4 x float> %vecinit6
268 ; CHECK-LABEL: insertps_from_shufflevector_2:
270 ; CHECK: insertps $96,
274 ; For loading an i32 from memory into an xmm register we use pinsrd
275 ; instead of insertps
;
; Integer counterpart of @insertps_from_shufflevector_1: shuffle mask
; <0,1,2,4> takes lanes 0-2 from %a and lane 3 from element 0 of the
; <4 x i32> loaded from %pb.
276 define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
278 %0 = load <4 x i32>* %pb, align 16
279 %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
280 ret <4 x i32> %vecinit6
281 ; CHECK-LABEL: pinsrd_from_shufflevector_i32:
; Shuffle mask <0,7,2,3> on <4 x i32>: lane 1 of the result is element 3 of
; %b, all other lanes come from %a.
; Expected immediate 208 (0xD0): source lane 3 -> destination lane 1, no lanes
; zeroed.
288 define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
290 %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
291 ret <4 x i32> %vecinit6
292 ; CHECK-LABEL: insertps_from_shufflevector_i32_2:
295 ; CHECK: insertps $208,
; Loads a scalar float from %b, puts it in lane 0 of an undef vector, then
; shuffles with mask <0,4,2,3> so the loaded value lands in lane 1 of %a.
; Expected immediate 16 (0x10): source lane 0 -> destination lane 1, no lanes
; zeroed.
299 define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
300 ; CHECK-LABEL: insertps_from_load_ins_elt_undef:
303 ; CHECK: insertps $16,
305 %1 = load float* %b, align 4
306 %2 = insertelement <4 x float> undef, float %1, i32 0
307 %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
308 ret <4 x float> %result
; Integer variant: loads an i32 from %b, puts it in lane 0 of an undef vector,
; then shuffles with mask <0,1,4,3> so the loaded value lands in lane 2 of %a.
; Expected immediate 32 (0x20): source lane 0 -> destination lane 2, no lanes
; zeroed.
311 define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
312 ; CHECK-LABEL: insertps_from_load_ins_elt_undef_i32:
313 ; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
316 ; CHECK: insertps $32,
318 %1 = load i32* %b, align 4
319 %2 = insertelement <4 x i32> undef, i32 %1, i32 0
320 %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
321 ret <4 x i32> %result
324 ;;;;;; Shuffles optimizable with a single insertps instruction
; Builds <x0, x1, x2, 0.0> from %x with an extract/insert chain (%a is unused).
; The visible negative check forbids punpckldq after the function label.
325 define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
326 ; CHECK-LABEL: shuf_XYZ0:
328 ; CHECK-NOT: punpckldq
331 %vecext = extractelement <4 x float> %x, i32 0
332 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
333 %vecext1 = extractelement <4 x float> %x, i32 1
334 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
335 %vecext3 = extractelement <4 x float> %x, i32 2
336 %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
337 %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
338 ret <4 x float> %vecinit5
; Builds <x0, x1, 0.0, 0.0> from %x (%a is unused).
; Expected immediate 12 (0x0C): zero mask selects lanes 2 and 3; no punpckldq
; may precede the insertps.
341 define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
342 ; CHECK-LABEL: shuf_XY00:
344 ; CHECK-NOT: punpckldq
345 ; CHECK: insertps $12
347 %vecext = extractelement <4 x float> %x, i32 0
348 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
349 %vecext1 = extractelement <4 x float> %x, i32 1
350 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
351 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
352 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
353 ret <4 x float> %vecinit4
; Builds <x0, x1, x1, 0.0> from %x; element 1 is reused for lane 2 (%a is
; unused).
; Expected immediate 104 (0x68): source lane 1 -> destination lane 2, lane 3
; zeroed; no punpckldq may precede the insertps.
356 define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
357 ; CHECK-LABEL: shuf_XYY0:
359 ; CHECK-NOT: punpckldq
360 ; CHECK: insertps $104
362 %vecext = extractelement <4 x float> %x, i32 0
363 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
364 %vecext1 = extractelement <4 x float> %x, i32 1
365 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
366 %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
367 %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
368 ret <4 x float> %vecinit5
; Builds <x0, x1, x3, 0.0> from %x (%a is unused).
; Expected immediate 232 (0xE8): source lane 3 -> destination lane 2, lane 3
; zeroed.
371 define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
372 ; CHECK-LABEL: shuf_XYW0:
373 ; CHECK: insertps $232
375 %vecext = extractelement <4 x float> %x, i32 0
376 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
377 %vecext1 = extractelement <4 x float> %x, i32 1
378 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
379 %vecext2 = extractelement <4 x float> %x, i32 3
380 %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
381 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
382 ret <4 x float> %vecinit4
; Builds <x3, 0.0, 0.0, x3> from %x (%a is unused).
; Expected immediate 198 (0xC6): source lane 3 -> destination lane 0, lanes 1
; and 2 zeroed; no punpckldq may precede the insertps.
385 define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
386 ; CHECK-LABEL: shuf_W00W:
388 ; CHECK-NOT: punpckldq
389 ; CHECK: insertps $198
391 %vecext = extractelement <4 x float> %x, i32 3
392 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
393 %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
394 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
395 %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
396 ret <4 x float> %vecinit4
; Builds <x0, 0.0, 0.0, a0>: lanes 0-2 via insertelement, lane 3 via a shuffle
; whose index 4 selects element 0 of %a.
; Expected immediate 48 (0x30): source lane 0 -> destination lane 3, no lanes
; zeroed by that instruction.
399 define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
400 ; CHECK-LABEL: shuf_X00A:
403 ; CHECK: insertps $48
405 %vecext = extractelement <4 x float> %x, i32 0
406 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
407 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
408 %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
409 %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
410 ret <4 x float> %vecinit4
; Builds <x0, 0.0, 0.0, x0>: same shape as @shuf_X00A, but the final shuffle's
; second operand is %x itself, so index 4 selects element 0 of %x (%a is
; unused).
; Expected immediate 48 (0x30): source lane 0 -> destination lane 3.
413 define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
414 ; CHECK-LABEL: shuf_X00X:
417 ; CHECK: insertps $48
419 %vecext = extractelement <4 x float> %x, i32 0
420 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
421 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
422 %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
423 %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
424 ret <4 x float> %vecinit4
; Builds <x0, 0.0, x1, a2>: the first shuffle's index 5 selects element 1 of
; %x for lane 2 (lane 3 left undef), the second shuffle's index 6 selects
; element 2 of %a for lane 3.
; Expected immediate 176 (0xB0): source lane 2 -> destination lane 3, no lanes
; zeroed by that instruction.
427 define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
428 ; CHECK-LABEL: shuf_X0YC:
432 ; CHECK: insertps $176
434 %vecext = extractelement <4 x float> %x, i32 0
435 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
436 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
437 %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
438 %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
439 ret <4 x float> %vecinit5
; <4 x i32> variant of @shuf_XYZ0: builds <x0, x1, x2, 0> from %x (%a is
; unused). The visible negative check forbids punpckldq after the function
; label.
442 define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
443 ; CHECK-LABEL: i32_shuf_XYZ0:
445 ; CHECK-NOT: punpckldq
448 %vecext = extractelement <4 x i32> %x, i32 0
449 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
450 %vecext1 = extractelement <4 x i32> %x, i32 1
451 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
452 %vecext3 = extractelement <4 x i32> %x, i32 2
453 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
454 %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
455 ret <4 x i32> %vecinit5
; <4 x i32> variant of @shuf_XY00: builds <x0, x1, 0, 0> from %x (%a is
; unused).
; Expected immediate 12 (0x0C): zero mask selects lanes 2 and 3; no punpckldq
; may precede the insertps.
458 define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
459 ; CHECK-LABEL: i32_shuf_XY00:
461 ; CHECK-NOT: punpckldq
462 ; CHECK: insertps $12
464 %vecext = extractelement <4 x i32> %x, i32 0
465 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
466 %vecext1 = extractelement <4 x i32> %x, i32 1
467 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
468 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
469 %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
470 ret <4 x i32> %vecinit4
; <4 x i32> variant of @shuf_XYY0: builds <x0, x1, x1, 0> from %x (%a is
; unused).
; Expected immediate 104 (0x68): source lane 1 -> destination lane 2, lane 3
; zeroed; no punpckldq may precede the insertps.
473 define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
474 ; CHECK-LABEL: i32_shuf_XYY0:
476 ; CHECK-NOT: punpckldq
477 ; CHECK: insertps $104
479 %vecext = extractelement <4 x i32> %x, i32 0
480 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
481 %vecext1 = extractelement <4 x i32> %x, i32 1
482 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
483 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
484 %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
485 ret <4 x i32> %vecinit5
; <4 x i32> variant of @shuf_XYW0: builds <x0, x1, x3, 0> from %x (%a is
; unused).
; Expected immediate 232 (0xE8): source lane 3 -> destination lane 2, lane 3
; zeroed; no punpckldq may precede the insertps.
488 define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
489 ; CHECK-LABEL: i32_shuf_XYW0:
491 ; CHECK-NOT: punpckldq
492 ; CHECK: insertps $232
494 %vecext = extractelement <4 x i32> %x, i32 0
495 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
496 %vecext1 = extractelement <4 x i32> %x, i32 1
497 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
498 %vecext2 = extractelement <4 x i32> %x, i32 3
499 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
500 %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
501 ret <4 x i32> %vecinit4
; <4 x i32> variant of @shuf_W00W: builds <x3, 0, 0, x3> from %x (%a is
; unused).
; Expected immediate 198 (0xC6): source lane 3 -> destination lane 0, lanes 1
; and 2 zeroed; no punpckldq may precede the insertps.
504 define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
505 ; CHECK-LABEL: i32_shuf_W00W:
507 ; CHECK-NOT: punpckldq
508 ; CHECK: insertps $198
510 %vecext = extractelement <4 x i32> %x, i32 3
511 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
512 %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
513 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
514 %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
515 ret <4 x i32> %vecinit4
; <4 x i32> variant of @shuf_X00A: builds <x0, 0, 0, a0>; the final shuffle's
; index 4 selects element 0 of %a for lane 3.
; Expected immediate 48 (0x30): source lane 0 -> destination lane 3.
518 define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
519 ; CHECK-LABEL: i32_shuf_X00A:
522 ; CHECK: insertps $48
524 %vecext = extractelement <4 x i32> %x, i32 0
525 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
526 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
527 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
528 %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
529 ret <4 x i32> %vecinit4
; <4 x i32> variant of @shuf_X00X: builds <x0, 0, 0, x0>; the final shuffle's
; second operand is %x itself, so index 4 selects element 0 of %x (%a is
; unused).
; Expected immediate 48 (0x30): source lane 0 -> destination lane 3.
532 define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
533 ; CHECK-LABEL: i32_shuf_X00X:
536 ; CHECK: insertps $48
538 %vecext = extractelement <4 x i32> %x, i32 0
539 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
540 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
541 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
542 %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
543 ret <4 x i32> %vecinit4
; <4 x i32> variant of @shuf_X0YC: builds <x0, 0, x1, a2>. The first shuffle's
; index 5 selects element 1 of %x for lane 2 (lane 3 left undef); the second
; shuffle's index 6 selects element 2 of %a for lane 3.
; Expected immediate 176 (0xB0): source lane 2 -> destination lane 3.
546 define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
547 ; CHECK-LABEL: i32_shuf_X0YC:
551 ; CHECK: insertps $176
553 %vecext = extractelement <4 x i32> %x, i32 0
554 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
555 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
556 %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
557 %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
558 ret <4 x i32> %vecinit5
561 ;; Test for a bug in the first implementation of LowerBuildVectorv4x32
562 define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
563 ; CHECK-LABEL: test_insertps_no_undef:
564 ; CHECK: movaps %xmm0, %xmm1
565 ; CHECK-NEXT: insertps $8, %xmm1, %xmm1
566 ; CHECK-NEXT: maxps %xmm1, %xmm0
568 %vecext = extractelement <4 x float> %x, i32 0
569 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
570 %vecext1 = extractelement <4 x float> %x, i32 1
571 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
572 %vecext3 = extractelement <4 x float> %x, i32 2
573 %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
574 %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
575 %mask = fcmp olt <4 x float> %vecinit5, %x
576 %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5