1 ; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X32
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X64
4 @g16 = external global i16
; (review) pinsrd_1 / pinsrb_1: insertelement of the scalar argument into lane 1
; of the vector argument; CHECKs expect a single pinsrd/pinsrb $1 (stack operand
; on X32, %edi on X64). NOTE(review): extraction of this file is lossy - label
; and ret lines are missing here; code kept verbatim.
6 define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
9 ; X32-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0
12 ; X64-LABEL: pinsrd_1:
14 ; X64-NEXT: pinsrd $1, %edi, %xmm0
16 %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
20 define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
21 ; X32-LABEL: pinsrb_1:
23 ; X32-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0
26 ; X64-LABEL: pinsrb_1:
28 ; X64-NEXT: pinsrb $1, %edi, %xmm0
30 %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
; (review) pmovsxbd_1: loads an i32, builds a zero-extended <4 x i32>, bitcasts
; to <16 x i8>, and calls the pmovsxbd intrinsic; CHECKs expect the load to be
; folded into a single memory-operand pmovsxbd.
34 define <2 x i64> @pmovsxbd_1(i32* %p) nounwind {
35 ; X32-LABEL: pmovsxbd_1:
36 ; X32: ## BB#0: ## %entry
37 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
38 ; X32-NEXT: pmovsxbd (%eax), %xmm0
41 ; X64-LABEL: pmovsxbd_1:
42 ; X64: ## BB#0: ## %entry
43 ; X64-NEXT: pmovsxbd (%rdi), %xmm0
46 %0 = load i32, i32* %p, align 4
47 %1 = insertelement <4 x i32> undef, i32 %0, i32 0
48 %2 = insertelement <4 x i32> %1, i32 0, i32 1
49 %3 = insertelement <4 x i32> %2, i32 0, i32 2
50 %4 = insertelement <4 x i32> %3, i32 0, i32 3
51 %5 = bitcast <4 x i32> %4 to <16 x i8>
52 %6 = tail call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %5) nounwind readnone
53 %7 = bitcast <4 x i32> %6 to <2 x i64>
; (review) pmovsxwd_1: same folding pattern as pmovsxbd_1 but loading an i64
; into the low half and sign-extending words; expects a memory-operand pmovsxwd.
57 define <2 x i64> @pmovsxwd_1(i64* %p) nounwind readonly {
58 ; X32-LABEL: pmovsxwd_1:
59 ; X32: ## BB#0: ## %entry
60 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
61 ; X32-NEXT: pmovsxwd (%eax), %xmm0
64 ; X64-LABEL: pmovsxwd_1:
65 ; X64: ## BB#0: ## %entry
66 ; X64-NEXT: pmovsxwd (%rdi), %xmm0
69 %0 = load i64, i64* %p ; <i64> [#uses=1]
70 %tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0 ; <<2 x i64>> [#uses=1]
71 %1 = bitcast <2 x i64> %tmp2 to <8 x i16> ; <<8 x i16>> [#uses=1]
72 %2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone ; <<4 x i32>> [#uses=1]
73 %3 = bitcast <4 x i32> %2 to <2 x i64> ; <<2 x i64>> [#uses=1]
; (review) pmovzxbq_1: loads i16 from external global @g16 and zero-extends two
; bytes to quadwords via the pmovzxbq intrinsic; CHECKs verify the Darwin
; non-lazy-pointer / GOT access plus a folded-load pmovzxbq.
77 define <2 x i64> @pmovzxbq_1() nounwind {
78 ; X32-LABEL: pmovzxbq_1:
79 ; X32: ## BB#0: ## %entry
80 ; X32-NEXT: movl L_g16$non_lazy_ptr, %eax
81 ; X32-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
84 ; X64-LABEL: pmovzxbq_1:
85 ; X64: ## BB#0: ## %entry
86 ; X64-NEXT: movq _g16@{{.*}}(%rip), %rax
87 ; X64-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
90 %0 = load i16, i16* @g16, align 2 ; <i16> [#uses=1]
91 %1 = insertelement <8 x i16> undef, i16 %0, i32 0 ; <<8 x i16>> [#uses=1]
92 %2 = bitcast <8 x i16> %1 to <16 x i8> ; <<16 x i8>> [#uses=1]
93 %3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone ; <<2 x i64>> [#uses=1]
; Intrinsic declarations for the pmovsx/pmovzx tests above.
97 declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
98 declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
99 declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
; (review) extractps_1/_2: extracting lane 3 as an i32 (directly, or via a
; float extract + bitcast) should lower to extractps $3 into a GPR.
101 define i32 @extractps_1(<4 x float> %v) nounwind {
102 ; X32-LABEL: extractps_1:
104 ; X32-NEXT: extractps $3, %xmm0, %eax
107 ; X64-LABEL: extractps_1:
109 ; X64-NEXT: extractps $3, %xmm0, %eax
111 %s = extractelement <4 x float> %v, i32 3
112 %i = bitcast float %s to i32
115 define i32 @extractps_2(<4 x float> %v) nounwind {
116 ; X32-LABEL: extractps_2:
118 ; X32-NEXT: extractps $3, %xmm0, %eax
121 ; X64-LABEL: extractps_2:
123 ; X64-NEXT: extractps $3, %xmm0, %eax
125 %t = bitcast <4 x float> %v to <4 x i32>
126 %s = extractelement <4 x i32> %t, i32 3
131 ; The non-store form of extractps puts its result into a GPR.
132 ; This makes it suitable for an extract from a <4 x float> that
133 ; is bitcasted to i32, but unsuitable for much of anything else.
; (review) ext_1/ext_2: float extracts of lane 3 are expected to use shufps
; (not extractps, which targets a GPR - see comment above); ext_3 extracts an
; i32 lane and expects pextrd $3.
135 define float @ext_1(<4 x float> %v) nounwind {
138 ; X32-NEXT: pushl %eax
139 ; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
140 ; X32-NEXT: addss LCPI7_0, %xmm0
141 ; X32-NEXT: movss %xmm0, (%esp)
142 ; X32-NEXT: flds (%esp)
143 ; X32-NEXT: popl %eax
148 ; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
149 ; X64-NEXT: addss {{.*}}(%rip), %xmm0
151 %s = extractelement <4 x float> %v, i32 3
152 %t = fadd float %s, 1.0
155 define float @ext_2(<4 x float> %v) nounwind {
158 ; X32-NEXT: pushl %eax
159 ; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
160 ; X32-NEXT: movss %xmm0, (%esp)
161 ; X32-NEXT: flds (%esp)
162 ; X32-NEXT: popl %eax
167 ; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
169 %s = extractelement <4 x float> %v, i32 3
172 define i32 @ext_3(<4 x i32> %v) nounwind {
175 ; X32-NEXT: pextrd $3, %xmm0, %eax
180 ; X64-NEXT: pextrd $3, %xmm0, %eax
182 %i = extractelement <4 x i32> %v, i32 3
; (review) insertps_1: direct call of the insertps intrinsic with imm 1
; (zero low lane); CHECKs expect the zero,xmm0[1,2,3] form on both targets.
186 define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
187 ; X32-LABEL: insertps_1:
189 ; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
192 ; X64-LABEL: insertps_1:
194 ; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
196 %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone
197 ret <4 x float> %tmp1
; Declaration shared by the insertps tests below.
200 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
202 ; When optimizing for speed, prefer blendps over insertps even if it means we have to
203 ; generate a separate movss to load the scalar operand.
; (review) blendps_not_insertps_1: scalar-into-lane-0 insert at default opt
; level should pick blendps (plus a movss on X32 for the stack argument).
204 define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
205 ; X32-LABEL: blendps_not_insertps_1:
207 ; X32-NEXT: movss {{.*#+}} xmm1
208 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
211 ; X64-LABEL: blendps_not_insertps_1:
213 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
215 %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
216 ret <4 x float> %tmp1
219 ; When optimizing for size, generate an insertps if there's a load fold opportunity.
220 ; The difference between i386 and x86-64 ABIs for the float operand means we should
221 ; generate an insertps for X32 but not for X64!
; (review) insertps_or_blendps: same IR as blendps_not_insertps_1 but minsize;
; X32 can fold the stack load into insertps, X64 (arg in xmm1) keeps blendps.
222 define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
223 ; X32-LABEL: insertps_or_blendps:
225 ; X32-NEXT: insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
228 ; X64-LABEL: insertps_or_blendps:
230 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
232 %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
233 ret <4 x float> %tmp1
236 ; An insert into the low 32-bits of a vector from the low 32-bits of another vector
237 ; is always just a blendps because blendps is never more expensive than insertps.
; (review) blendps_not_insertps_2: low-lane-to-low-lane vector insert; expected
; to be a single blendps on both targets (see comment above).
238 define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
239 ; X32-LABEL: blendps_not_insertps_2:
241 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
244 ; X64-LABEL: blendps_not_insertps_2:
246 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
248 %tmp2 = extractelement <4 x float> %t2, i32 0
249 %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
250 ret <4 x float> %tmp1
; (review) ptestz_1/_2/_3: lowering of the ptestz/ptestc/ptestnzc intrinsics.
; _1 and _3 expect ptest + setcc + movzbl; _2 expects the sbbl/andl carry idiom.
253 define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
254 ; X32-LABEL: ptestz_1:
256 ; X32-NEXT: ptest %xmm1, %xmm0
258 ; X32-NEXT: movzbl %al, %eax
261 ; X64-LABEL: ptestz_1:
263 ; X64-NEXT: ptest %xmm1, %xmm0
265 ; X64-NEXT: movzbl %al, %eax
267 %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
271 define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
272 ; X32-LABEL: ptestz_2:
274 ; X32-NEXT: ptest %xmm1, %xmm0
275 ; X32-NEXT: sbbl %eax, %eax
276 ; X32-NEXT: andl $1, %eax
279 ; X64-LABEL: ptestz_2:
281 ; X64-NEXT: ptest %xmm1, %xmm0
282 ; X64-NEXT: sbbl %eax, %eax
283 ; X64-NEXT: andl $1, %eax
285 %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
289 define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
290 ; X32-LABEL: ptestz_3:
292 ; X32-NEXT: ptest %xmm1, %xmm0
294 ; X32-NEXT: movzbl %al, %eax
297 ; X64-LABEL: ptestz_3:
299 ; X64-NEXT: ptest %xmm1, %xmm0
301 ; X64-NEXT: movzbl %al, %eax
303 %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
; Intrinsic declarations for the ptest tests above.
308 declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
309 declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
310 declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
312 ; This used to compile to insertps $0 + insertps $16. insertps $0 is always
; pointless. (NOTE(review): continuation line lost in extraction; reconstructed - verify against upstream sse41.ll.)
; (review) buildvector: complex-add style pattern (re/im lanes added
; separately, then rebuilt); expects movshdup + addss pairs and one insertps.
314 define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
315 ; X32-LABEL: buildvector:
316 ; X32: ## BB#0: ## %entry
317 ; X32-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
318 ; X32-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
319 ; X32-NEXT: addss %xmm1, %xmm0
320 ; X32-NEXT: addss %xmm2, %xmm3
321 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
324 ; X64-LABEL: buildvector:
325 ; X64: ## BB#0: ## %entry
326 ; X64-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
327 ; X64-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
328 ; X64-NEXT: addss %xmm1, %xmm0
329 ; X64-NEXT: addss %xmm2, %xmm3
330 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
333 %tmp7 = extractelement <2 x float> %A, i32 0
334 %tmp5 = extractelement <2 x float> %A, i32 1
335 %tmp3 = extractelement <2 x float> %B, i32 0
336 %tmp1 = extractelement <2 x float> %B, i32 1
337 %add.r = fadd float %tmp7, %tmp3
338 %add.i = fadd float %tmp5, %tmp1
339 %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
340 %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
341 ret <2 x float> %tmp9
; (review) insertps_from_shufflevector_1/_2: shufflevector patterns that should
; match a single insertps - from a loaded vector's lane 0 into lane 3, and from
; a register operand's lane 1 into lane 2.
344 define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
345 ; X32-LABEL: insertps_from_shufflevector_1:
346 ; X32: ## BB#0: ## %entry
347 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
348 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
351 ; X64-LABEL: insertps_from_shufflevector_1:
352 ; X64: ## BB#0: ## %entry
353 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
356 %0 = load <4 x float>, <4 x float>* %pb, align 16
357 %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
358 ret <4 x float> %vecinit6
361 define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
362 ; X32-LABEL: insertps_from_shufflevector_2:
363 ; X32: ## BB#0: ## %entry
364 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
367 ; X64-LABEL: insertps_from_shufflevector_2:
368 ; X64: ## BB#0: ## %entry
369 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
372 %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
373 ret <4 x float> %vecinit6
376 ; For loading an i32 from memory into an xmm register we use pinsrd
377 ; instead of insertps
; (review) integer variants of the shufflevector-insert tests: current codegen
; uses pshufd + pblendw rather than pinsrd/insertps (see comment above).
378 define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
379 ; X32-LABEL: pinsrd_from_shufflevector_i32:
380 ; X32: ## BB#0: ## %entry
381 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
382 ; X32-NEXT: pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
383 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
386 ; X64-LABEL: pinsrd_from_shufflevector_i32:
387 ; X64: ## BB#0: ## %entry
388 ; X64-NEXT: pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
389 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
392 %0 = load <4 x i32>, <4 x i32>* %pb, align 16
393 %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
394 ret <4 x i32> %vecinit6
397 define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
398 ; X32-LABEL: insertps_from_shufflevector_i32_2:
399 ; X32: ## BB#0: ## %entry
400 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
401 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
404 ; X64-LABEL: insertps_from_shufflevector_i32_2:
405 ; X64: ## BB#0: ## %entry
406 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
407 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
410 %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
411 ret <4 x i32> %vecinit6
; (review) load + insertelement-into-undef + shufflevector: float version folds
; to one insertps; i32 version currently goes through movd/pshufd/pblendw
; (TODO comment below notes the extra mov could be removed).
414 define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
415 ; X32-LABEL: insertps_from_load_ins_elt_undef:
417 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
418 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
421 ; X64-LABEL: insertps_from_load_ins_elt_undef:
423 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
425 %1 = load float, float* %b, align 4
426 %2 = insertelement <4 x float> undef, float %1, i32 0
427 %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
428 ret <4 x float> %result
431 ; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
432 define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
433 ; X32-LABEL: insertps_from_load_ins_elt_undef_i32:
435 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
436 ; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
437 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
438 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
441 ; X64-LABEL: insertps_from_load_ins_elt_undef_i32:
443 ; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
444 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
445 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
447 %1 = load i32, i32* %b, align 4
448 %2 = insertelement <4 x i32> undef, i32 %1, i32 0
449 %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
450 ret <4 x i32> %result
453 ;;;;;; Shuffles optimizable with a single insertps or blend instruction
; (review) float shuffle family (shuf_XYZ0 .. shuf_X0YC): each builds a
; <4 x float> element-by-element with zeros/duplicates and checks it lowers to
; a single blendps/movq/insertps (or a minimal pair). Code kept verbatim;
; the CHECK text is coupled to exact codegen output.
454 define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
455 ; X32-LABEL: shuf_XYZ0:
457 ; X32-NEXT: xorps %xmm1, %xmm1
458 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
461 ; X64-LABEL: shuf_XYZ0:
463 ; X64-NEXT: xorps %xmm1, %xmm1
464 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
466 %vecext = extractelement <4 x float> %x, i32 0
467 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
468 %vecext1 = extractelement <4 x float> %x, i32 1
469 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
470 %vecext3 = extractelement <4 x float> %x, i32 2
471 %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
472 %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
473 ret <4 x float> %vecinit5
476 define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
477 ; X32-LABEL: shuf_XY00:
479 ; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
482 ; X64-LABEL: shuf_XY00:
484 ; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
486 %vecext = extractelement <4 x float> %x, i32 0
487 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
488 %vecext1 = extractelement <4 x float> %x, i32 1
489 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
490 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
491 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
492 ret <4 x float> %vecinit4
495 define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
496 ; X32-LABEL: shuf_XYY0:
498 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
501 ; X64-LABEL: shuf_XYY0:
503 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
505 %vecext = extractelement <4 x float> %x, i32 0
506 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
507 %vecext1 = extractelement <4 x float> %x, i32 1
508 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
509 %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
510 %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
511 ret <4 x float> %vecinit5
514 define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
515 ; X32-LABEL: shuf_XYW0:
517 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
520 ; X64-LABEL: shuf_XYW0:
522 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
524 %vecext = extractelement <4 x float> %x, i32 0
525 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
526 %vecext1 = extractelement <4 x float> %x, i32 1
527 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
528 %vecext2 = extractelement <4 x float> %x, i32 3
529 %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
530 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
531 ret <4 x float> %vecinit4
534 define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
535 ; X32-LABEL: shuf_W00W:
537 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
540 ; X64-LABEL: shuf_W00W:
542 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
544 %vecext = extractelement <4 x float> %x, i32 3
545 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
546 %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
547 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
548 %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
549 ret <4 x float> %vecinit4
552 define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
553 ; X32-LABEL: shuf_X00A:
555 ; X32-NEXT: xorps %xmm2, %xmm2
556 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
557 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
560 ; X64-LABEL: shuf_X00A:
562 ; X64-NEXT: xorps %xmm2, %xmm2
563 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
564 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
566 %vecext = extractelement <4 x float> %x, i32 0
567 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
568 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
569 %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
570 %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
571 ret <4 x float> %vecinit4
574 define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
575 ; X32-LABEL: shuf_X00X:
577 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
580 ; X64-LABEL: shuf_X00X:
582 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
584 %vecext = extractelement <4 x float> %x, i32 0
585 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
586 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
587 %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
588 %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
589 ret <4 x float> %vecinit4
592 define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
593 ; X32-LABEL: shuf_X0YC:
595 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
596 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
599 ; X64-LABEL: shuf_X0YC:
601 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
602 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
604 %vecext = extractelement <4 x float> %x, i32 0
605 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
606 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
607 %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
608 %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
609 ret <4 x float> %vecinit5
; (review) i32 shuffle family (i32_shuf_XYZ0 .. i32_shuf_X0YC): integer
; counterparts of the float shuffles above; current codegen uses
; pxor/pshufd/pblendw (and pmovzxdq for X0YC) instead of insertps forms.
; Code kept verbatim; CHECK text is coupled to exact codegen output.
612 define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
613 ; X32-LABEL: i32_shuf_XYZ0:
615 ; X32-NEXT: pxor %xmm1, %xmm1
616 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
619 ; X64-LABEL: i32_shuf_XYZ0:
621 ; X64-NEXT: pxor %xmm1, %xmm1
622 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
624 %vecext = extractelement <4 x i32> %x, i32 0
625 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
626 %vecext1 = extractelement <4 x i32> %x, i32 1
627 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
628 %vecext3 = extractelement <4 x i32> %x, i32 2
629 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
630 %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
631 ret <4 x i32> %vecinit5
634 define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
635 ; X32-LABEL: i32_shuf_XY00:
637 ; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
640 ; X64-LABEL: i32_shuf_XY00:
642 ; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
644 %vecext = extractelement <4 x i32> %x, i32 0
645 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
646 %vecext1 = extractelement <4 x i32> %x, i32 1
647 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
648 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
649 %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
650 ret <4 x i32> %vecinit4
653 define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
654 ; X32-LABEL: i32_shuf_XYY0:
656 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
657 ; X32-NEXT: pxor %xmm0, %xmm0
658 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
661 ; X64-LABEL: i32_shuf_XYY0:
663 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
664 ; X64-NEXT: pxor %xmm0, %xmm0
665 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
667 %vecext = extractelement <4 x i32> %x, i32 0
668 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
669 %vecext1 = extractelement <4 x i32> %x, i32 1
670 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
671 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
672 %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
673 ret <4 x i32> %vecinit5
676 define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
677 ; X32-LABEL: i32_shuf_XYW0:
679 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
680 ; X32-NEXT: pxor %xmm0, %xmm0
681 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
684 ; X64-LABEL: i32_shuf_XYW0:
686 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
687 ; X64-NEXT: pxor %xmm0, %xmm0
688 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
690 %vecext = extractelement <4 x i32> %x, i32 0
691 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
692 %vecext1 = extractelement <4 x i32> %x, i32 1
693 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
694 %vecext2 = extractelement <4 x i32> %x, i32 3
695 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
696 %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
697 ret <4 x i32> %vecinit4
700 define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
701 ; X32-LABEL: i32_shuf_W00W:
703 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
704 ; X32-NEXT: pxor %xmm0, %xmm0
705 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
708 ; X64-LABEL: i32_shuf_W00W:
710 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
711 ; X64-NEXT: pxor %xmm0, %xmm0
712 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
714 %vecext = extractelement <4 x i32> %x, i32 3
715 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
716 %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
717 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
718 %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
719 ret <4 x i32> %vecinit4
722 define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
723 ; X32-LABEL: i32_shuf_X00A:
725 ; X32-NEXT: pxor %xmm2, %xmm2
726 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
727 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
728 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
731 ; X64-LABEL: i32_shuf_X00A:
733 ; X64-NEXT: pxor %xmm2, %xmm2
734 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
735 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
736 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
738 %vecext = extractelement <4 x i32> %x, i32 0
739 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
740 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
741 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
742 %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
743 ret <4 x i32> %vecinit4
746 define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
747 ; X32-LABEL: i32_shuf_X00X:
749 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
750 ; X32-NEXT: pxor %xmm0, %xmm0
751 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
754 ; X64-LABEL: i32_shuf_X00X:
756 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
757 ; X64-NEXT: pxor %xmm0, %xmm0
758 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
760 %vecext = extractelement <4 x i32> %x, i32 0
761 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
762 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
763 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
764 %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
765 ret <4 x i32> %vecinit4
768 define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
769 ; X32-LABEL: i32_shuf_X0YC:
771 ; X32-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
772 ; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
773 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
776 ; X64-LABEL: i32_shuf_X0YC:
778 ; X64-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
779 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
780 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
782 %vecext = extractelement <4 x i32> %x, i32 0
783 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
784 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
785 %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
786 %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
787 ret <4 x i32> %vecinit5
790 ;; Test for a bug in the first implementation of LowerBuildVectorv4x32
; (review) test_insertps_no_undef: regression test (see comment above) - the
; built vector must be zeroed in lane 3 into a fresh register (xmm1), not
; clobber %x, since %x is reused by the fcmp/select (lowered to maxps).
791 define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
792 ; X32-LABEL: test_insertps_no_undef:
794 ; X32-NEXT: xorps %xmm1, %xmm1
795 ; X32-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
796 ; X32-NEXT: maxps %xmm1, %xmm0
799 ; X64-LABEL: test_insertps_no_undef:
801 ; X64-NEXT: xorps %xmm1, %xmm1
802 ; X64-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
803 ; X64-NEXT: maxps %xmm1, %xmm0
805 %vecext = extractelement <4 x float> %x, i32 0
806 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
807 %vecext1 = extractelement <4 x float> %x, i32 1
808 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
809 %vecext3 = extractelement <4 x float> %x, i32 2
810 %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
811 %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
812 %mask = fcmp olt <4 x float> %vecinit5, %x
813 %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
; (review) blendvb_fallback: select on an <8 x i1> mask; expected lowering
; sign-extends the mask (psllw/psraw $15) then uses pblendvb.
817 define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
818 ; X32-LABEL: blendvb_fallback:
820 ; X32-NEXT: psllw $15, %xmm0
821 ; X32-NEXT: psraw $15, %xmm0
822 ; X32-NEXT: pblendvb %xmm1, %xmm2
823 ; X32-NEXT: movdqa %xmm2, %xmm0
826 ; X64-LABEL: blendvb_fallback:
828 ; X64-NEXT: psllw $15, %xmm0
829 ; X64-NEXT: psraw $15, %xmm0
830 ; X64-NEXT: pblendvb %xmm1, %xmm2
831 ; X64-NEXT: movdqa %xmm2, %xmm0
833 %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
837 ; On X32, account for the argument's move to registers
; (review) insertps_from_vector_load{,_offset,_offset_2}: insertps intrinsic
; with a loaded source vector; the load (including a non-zero CountS element
; offset, and an indexed GEP address) must fold into insertps's memory operand.
838 define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
839 ; X32-LABEL: insertps_from_vector_load:
841 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
842 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
845 ; X64-LABEL: insertps_from_vector_load:
847 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
849 %1 = load <4 x float>, <4 x float>* %pb, align 16
850 %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
854 ;; Use a non-zero CountS for insertps
855 ;; Try to match a bit more of the instr, since we need the load's offset.
856 define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
857 ; X32-LABEL: insertps_from_vector_load_offset:
859 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
860 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
863 ; X64-LABEL: insertps_from_vector_load_offset:
865 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
867 %1 = load <4 x float>, <4 x float>* %pb, align 16
868 %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
872 ;; Try to match a bit more of the instr, since we need the load's offset.
873 define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
874 ; X32-LABEL: insertps_from_vector_load_offset_2:
876 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
877 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
878 ; X32-NEXT: shll $4, %ecx
879 ; X32-NEXT: insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
882 ; X64-LABEL: insertps_from_vector_load_offset_2:
884 ; X64-NEXT: shlq $4, %rsi
885 ; X64-NEXT: insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
887 %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
888 %2 = load <4 x float>, <4 x float>* %1, align 16
889 %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
; (review) insertps_from_broadcast_loadf32/loadv4f32: a broadcast (all four
; lanes equal) fed into insertps imm 48 (select lane 0 into lane 3); current
; codegen materializes the splat (movss/movups + shufps) then one insertps.
893 define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
894 ; X32-LABEL: insertps_from_broadcast_loadf32:
896 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
897 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
898 ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
899 ; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
900 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
903 ; X64-LABEL: insertps_from_broadcast_loadf32:
905 ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
906 ; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
907 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
909 %1 = getelementptr inbounds float, float* %fb, i64 %index
910 %2 = load float, float* %1, align 4
911 %3 = insertelement <4 x float> undef, float %2, i32 0
912 %4 = insertelement <4 x float> %3, float %2, i32 1
913 %5 = insertelement <4 x float> %4, float %2, i32 2
914 %6 = insertelement <4 x float> %5, float %2, i32 3
915 %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
919 define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
920 ; X32-LABEL: insertps_from_broadcast_loadv4f32:
922 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
923 ; X32-NEXT: movups (%eax), %xmm1
924 ; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
925 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
928 ; X64-LABEL: insertps_from_broadcast_loadv4f32:
930 ; X64-NEXT: movups (%rdi), %xmm1
931 ; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
932 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
934 %1 = load <4 x float>, <4 x float>* %b, align 4
935 %2 = extractelement <4 x float> %1, i32 0
936 %3 = insertelement <4 x float> undef, float %2, i32 0
937 %4 = insertelement <4 x float> %3, float %2, i32 1
938 %5 = insertelement <4 x float> %4, float %2, i32 2
939 %6 = insertelement <4 x float> %5, float %2, i32 3
940 %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
944 ;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
; (review) insertps_from_broadcast_multiple_use: one broadcast feeding four
; insertps calls; the splat (xmm4) must be computed once and reused (FIXME
; above notes an extraneous pshufd/vbroadcast is still emitted).
945 define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
946 ; X32-LABEL: insertps_from_broadcast_multiple_use:
948 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
949 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
950 ; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
951 ; X32-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
952 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
953 ; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
954 ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
955 ; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
956 ; X32-NEXT: addps %xmm1, %xmm0
957 ; X32-NEXT: addps %xmm2, %xmm3
958 ; X32-NEXT: addps %xmm3, %xmm0
961 ; X64-LABEL: insertps_from_broadcast_multiple_use:
963 ; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
964 ; X64-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
965 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
966 ; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
967 ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
968 ; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
969 ; X64-NEXT: addps %xmm1, %xmm0
970 ; X64-NEXT: addps %xmm2, %xmm3
971 ; X64-NEXT: addps %xmm3, %xmm0
973 %1 = getelementptr inbounds float, float* %fb, i64 %index
974 %2 = load float, float* %1, align 4
975 %3 = insertelement <4 x float> undef, float %2, i32 0
976 %4 = insertelement <4 x float> %3, float %2, i32 1
977 %5 = insertelement <4 x float> %4, float %2, i32 2
978 %6 = insertelement <4 x float> %5, float %2, i32 3
979 %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
980 %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
981 %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
982 %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
983 %11 = fadd <4 x float> %7, %8
984 %12 = fadd <4 x float> %9, %10
985 %13 = fadd <4 x float> %11, %12
; Shuffle mask <4, undef, 0, 7> with undef result lanes: the checks show this
; lowers to movss (scalar load) + unpcklpd + movapd rather than an insertps.
989 define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
990 ; X32-LABEL: insertps_with_undefs:
992 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
993 ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
994 ; X32-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
995 ; X32-NEXT: movapd %xmm1, %xmm0
998 ; X64-LABEL: insertps_with_undefs:
1000 ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1001 ; X64-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1002 ; X64-NEXT: movapd %xmm1, %xmm0
1004 %1 = load float, float* %b, align 4
1005 %2 = insertelement <4 x float> undef, float %1, i32 0
1006 %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7>
1007 ret <4 x float> %result
1010 ; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using
1011 ; the destination index to change the load, instead of the source index.
; Result lane 3 takes %load lane 2, so the folded memory operand must be
; mem[2] (source-indexed), as the checks below pin down.
1012 define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
1013 ; X32-LABEL: pr20087:
1015 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
1016 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
1019 ; X64-LABEL: pr20087:
1021 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
1023 %load = load <4 x float> , <4 x float> *%ptr
1024 %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
1025 ret <4 x float> %ret
1028 ; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
; The i32 shuffle <0, 7, u, u> cannot be a single insertps (integer domain);
; both targets lower it to pshufd (swap xmm1 halves) + pblendw, then store the
; result unaligned through the i32* cast.
1029 define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* noalias nocapture %RET) #1 {
1030 ; X32-LABEL: insertps_pr20411:
1032 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
1033 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1034 ; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1035 ; X32-NEXT: movdqu %xmm1, (%eax)
1038 ; X64-LABEL: insertps_pr20411:
1040 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1041 ; X64-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1042 ; X64-NEXT: movdqu %xmm1, (%rdi)
1044 %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 0, i32 7, i32 undef, i32 undef>
1045 %ptrcast = bitcast i32* %RET to <4 x i32>*
1046 store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
; Build vector <A[0], 0, B[2], 0>: must fold to a single insertps
; (src B lane 2 -> dst lane 2, zero mask clears lanes 1 and 3).
1050 define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
1051 ; X32-LABEL: insertps_4:
1052 ; X32: ## BB#0: ## %entry
1053 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
1056 ; X64-LABEL: insertps_4:
1057 ; X64: ## BB#0: ## %entry
1058 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
1061 %vecext = extractelement <4 x float> %A, i32 0
1062 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1063 %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
1064 %vecext2 = extractelement <4 x float> %B, i32 2
1065 %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
1066 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1067 ret <4 x float> %vecinit4
; Build vector <A[0], B[1], 0, 0>: must fold to a single insertps
; (src B lane 1 -> dst lane 1, zero mask clears lanes 2 and 3).
1070 define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
1071 ; X32-LABEL: insertps_5:
1072 ; X32: ## BB#0: ## %entry
1073 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
1076 ; X64-LABEL: insertps_5:
1077 ; X64: ## BB#0: ## %entry
1078 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
1081 %vecext = extractelement <4 x float> %A, i32 0
1082 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1083 %vecext1 = extractelement <4 x float> %B, i32 1
1084 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
1085 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
1086 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1087 ret <4 x float> %vecinit4
; Build vector <0, A[1], B[2], 0>: must fold to a single insertps
; (src B lane 2 -> dst lane 2, zero mask clears lanes 0 and 3).
1090 define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
1091 ; X32-LABEL: insertps_6:
1092 ; X32: ## BB#0: ## %entry
1093 ; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
1096 ; X64-LABEL: insertps_6:
1097 ; X64: ## BB#0: ## %entry
1098 ; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
1101 %vecext = extractelement <4 x float> %A, i32 1
1102 %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
1103 %vecext1 = extractelement <4 x float> %B, i32 2
1104 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
1105 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
1106 ret <4 x float> %vecinit3
; Build vector <A[0], 0, B[1], 0>: must fold to a single insertps
; (src B lane 1 -> dst lane 2, zero mask clears lanes 1 and 3).
1109 define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
1110 ; X32-LABEL: insertps_7:
1111 ; X32: ## BB#0: ## %entry
1112 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
1115 ; X64-LABEL: insertps_7:
1116 ; X64: ## BB#0: ## %entry
1117 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
1120 %vecext = extractelement <4 x float> %A, i32 0
1121 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1122 %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
1123 %vecext2 = extractelement <4 x float> %B, i32 1
1124 %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
1125 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1126 ret <4 x float> %vecinit4
; Build vector <A[0], B[0], 0, 0>: must fold to a single insertps
; (src B lane 0 -> dst lane 1, zero mask clears lanes 2 and 3).
1129 define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
1130 ; X32-LABEL: insertps_8:
1131 ; X32: ## BB#0: ## %entry
1132 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1135 ; X64-LABEL: insertps_8:
1136 ; X64: ## BB#0: ## %entry
1137 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1140 %vecext = extractelement <4 x float> %A, i32 0
1141 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1142 %vecext1 = extractelement <4 x float> %B, i32 0
1143 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
1144 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
1145 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1146 ret <4 x float> %vecinit4
; Build vector <0, A[0], B[2], 0>: single insertps with the operands commuted —
; the insert lands in %xmm1, so a trailing movaps copies it back to %xmm0.
1149 define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
1150 ; X32-LABEL: insertps_9:
1151 ; X32: ## BB#0: ## %entry
1152 ; X32-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
1153 ; X32-NEXT: movaps %xmm1, %xmm0
1156 ; X64-LABEL: insertps_9:
1157 ; X64: ## BB#0: ## %entry
1158 ; X64-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
1159 ; X64-NEXT: movaps %xmm1, %xmm0
1162 %vecext = extractelement <4 x float> %A, i32 0
1163 %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
1164 %vecext1 = extractelement <4 x float> %B, i32 2
1165 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
1166 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
1167 ret <4 x float> %vecinit3
; A[0] inserted into lanes 0 and 2 of an all-zero vector: expect one insertps
; duplicating xmm0[0] into both lanes with the zero mask clearing lanes 1, 3.
1170 define <4 x float> @insertps_10(<4 x float> %A)
1171 ; X32-LABEL: insertps_10:
1173 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
1176 ; X64-LABEL: insertps_10:
1178 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
1181 %vecext = extractelement <4 x float> %A, i32 0
1182 %vecbuild1 = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %vecext, i32 0
1183 %vecbuild2 = insertelement <4 x float> %vecbuild1, float %vecext, i32 2
1184 ret <4 x float> %vecbuild2
; Zeroing insertelements plus a shufflevector collapse into a single blendps
; against an xorps-zeroed register (result <0, A[1], 0, A[3]>).
1187 define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
1188 ; X32-LABEL: build_vector_to_shuffle_1:
1189 ; X32: ## BB#0: ## %entry
1190 ; X32-NEXT: xorps %xmm1, %xmm1
1191 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1194 ; X64-LABEL: build_vector_to_shuffle_1:
1195 ; X64: ## BB#0: ## %entry
1196 ; X64-NEXT: xorps %xmm1, %xmm1
1197 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1200 %vecext = extractelement <4 x float> %A, i32 1
1201 %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
1202 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
1203 %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1204 ret <4 x float> %vecinit3
; Same pattern without the final shuffle: the zeroing build_vector becomes one
; blendps against an xorps-zeroed register (result <0, A[1], 0, 0>).
1207 define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
1208 ; X32-LABEL: build_vector_to_shuffle_2:
1209 ; X32: ## BB#0: ## %entry
1210 ; X32-NEXT: xorps %xmm1, %xmm1
1211 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1214 ; X64-LABEL: build_vector_to_shuffle_2:
1215 ; X64: ## BB#0: ## %entry
1216 ; X64-NEXT: xorps %xmm1, %xmm1
1217 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1220 %vecext = extractelement <4 x float> %A, i32 1
1221 %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
1222 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
1223 ret <4 x float> %vecinit1