1 ; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X32
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X64
4 @g16 = external global i16
; Inserting a scalar into lane 1 of a vector should select the SSE4.1
; pinsrd/pinsrb instructions: from a stack slot on i686 (arg passed on the
; stack), directly from %edi on x86-64 (arg passed in a register).
; NOTE(review): some CHECK/ret lines appear elided in this listing — verify
; against the checked-in test before regenerating.
6 define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
9 ; X32-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0
12 ; X64-LABEL: pinsrd_1:
14 ; X64-NEXT: pinsrd $1, %edi, %xmm0
16 %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
; i8 variant: the same lane-1 insert lowers to pinsrb.
20 define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
21 ; X32-LABEL: pinsrb_1:
23 ; X32-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0
26 ; X64-LABEL: pinsrb_1:
28 ; X64-NEXT: pinsrb $1, %edi, %xmm0
30 %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
; The i16 load of @g16 (inserted into lane 0 and bitcast to <16 x i8>) should
; be folded straight into the memory operand of pmovzxbq, zero-extending two
; bytes to two i64 lanes. Darwin addressing differs per target: a non-lazy
; pointer stub on i686 vs. a RIP-relative GOT load on x86-64.
34 define <2 x i64> @pmovzxbq_1() nounwind {
35 ; X32-LABEL: pmovzxbq_1:
36 ; X32: ## BB#0: ## %entry
37 ; X32-NEXT: movl L_g16$non_lazy_ptr, %eax
38 ; X32-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
41 ; X64-LABEL: pmovzxbq_1:
42 ; X64: ## BB#0: ## %entry
43 ; X64-NEXT: movq _g16@{{.*}}(%rip), %rax
44 ; X64-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
47 %0 = load i16, i16* @g16, align 2 ; <i16> [#uses=1]
48 %1 = insertelement <8 x i16> undef, i16 %0, i32 0 ; <<8 x i16>> [#uses=1]
49 %2 = bitcast <8 x i16> %1 to <16 x i8> ; <<16 x i8>> [#uses=1]
50 %3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone ; <<2 x i64>> [#uses=1]
54 declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
; Extracting float lane 3 and bitcasting it to i32 should become a single
; extractps $3 into a GPR on both targets (no shuffle + movd sequence).
56 define i32 @extractps_1(<4 x float> %v) nounwind {
57 ; X32-LABEL: extractps_1:
59 ; X32-NEXT: extractps $3, %xmm0, %eax
62 ; X64-LABEL: extractps_1:
64 ; X64-NEXT: extractps $3, %xmm0, %eax
66 %s = extractelement <4 x float> %v, i32 3
67 %i = bitcast float %s to i32
; Same result when the bitcast is done on the whole vector before the extract.
70 define i32 @extractps_2(<4 x float> %v) nounwind {
71 ; X32-LABEL: extractps_2:
73 ; X32-NEXT: extractps $3, %xmm0, %eax
76 ; X64-LABEL: extractps_2:
78 ; X64-NEXT: extractps $3, %xmm0, %eax
80 %t = bitcast <4 x float> %v to <4 x i32>
81 %s = extractelement <4 x i32> %t, i32 3
86 ; The non-store form of extractps puts its result into a GPR.
87 ; This makes it suitable for an extract from a <4 x float> that
88 ; is bitcasted to i32, but unsuitable for much of anything else.
; Extract float lane 3 and add 1.0: should use shufps to move lane 3 into
; lane 0 rather than extractps-to-GPR (the result stays a float). On i686 the
; float return value goes back through the x87 stack (movss to slot + flds).
90 define float @ext_1(<4 x float> %v) nounwind {
93 ; X32-NEXT: pushl %eax
94 ; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
95 ; X32-NEXT: addss LCPI5_0, %xmm0
96 ; X32-NEXT: movss %xmm0, (%esp)
97 ; X32-NEXT: flds (%esp)
103 ; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
104 ; X64-NEXT: addss {{.*}}(%rip), %xmm0
106 %s = extractelement <4 x float> %v, i32 3
107 %t = fadd float %s, 1.0
; Bare lane-3 extract returned as float: just the shufps, no arithmetic.
110 define float @ext_2(<4 x float> %v) nounwind {
113 ; X32-NEXT: pushl %eax
114 ; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
115 ; X32-NEXT: movss %xmm0, (%esp)
116 ; X32-NEXT: flds (%esp)
117 ; X32-NEXT: popl %eax
122 ; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
124 %s = extractelement <4 x float> %v, i32 3
; Integer lane-3 extract returned as i32: a single pextrd into %eax.
127 define i32 @ext_3(<4 x i32> %v) nounwind {
130 ; X32-NEXT: pextrd $3, %xmm0, %eax
135 ; X64-NEXT: pextrd $3, %xmm0, %eax
137 %i = extractelement <4 x i32> %v, i32 3
; Direct use of the insertps intrinsic with immediate 1 (zero-mask bit 0):
; lane 0 of the result is zeroed, lanes 1-3 come from %t1.
141 define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
142 ; X32-LABEL: insertps_1:
144 ; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
147 ; X64-LABEL: insertps_1:
149 ; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
151 %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone
152 ret <4 x float> %tmp1
155 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
157 ; When optimizing for speed, prefer blendps over insertps even if it means we have to
158 ; generate a separate movss to load the scalar operand.
159 define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
160 ; X32-LABEL: blendps_not_insertps_1:
162 ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
163 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
166 ; X64-LABEL: blendps_not_insertps_1:
168 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
170 %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
171 ret <4 x float> %tmp1
174 ; When optimizing for size, generate an insertps if there's a load fold opportunity.
175 ; The difference between i386 and x86-64 ABIs for the float operand means we should
176 ; generate an insertps for X32 but not for X64!
; (minsize attribute below is what flips the X32 choice to the load-folding insertps.)
177 define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
178 ; X32-LABEL: insertps_or_blendps:
180 ; X32-NEXT: insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
183 ; X64-LABEL: insertps_or_blendps:
185 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
187 %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
188 ret <4 x float> %tmp1
191 ; An insert into the low 32-bits of a vector from the low 32-bits of another vector
192 ; is always just a blendps because blendps is never more expensive than insertps.
193 define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
194 ; X32-LABEL: blendps_not_insertps_2:
196 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
199 ; X64-LABEL: blendps_not_insertps_2:
201 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
203 %tmp2 = extractelement <4 x float> %t2, i32 0
204 %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
205 ret <4 x float> %tmp1
; The three ptest intrinsics: each should compile to a single ptest followed
; by a flag-materializing sequence (setcc+movzbl for ZF/CF tests, sbb+and for
; the carry-based variant) rather than any branch.
208 define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
209 ; X32-LABEL: ptestz_1:
211 ; X32-NEXT: ptest %xmm1, %xmm0
213 ; X32-NEXT: movzbl %al, %eax
216 ; X64-LABEL: ptestz_1:
218 ; X64-NEXT: ptest %xmm1, %xmm0
220 ; X64-NEXT: movzbl %al, %eax
222 %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
; ptestc: the carry flag is materialized with sbb %eax,%eax / and $1.
226 define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
227 ; X32-LABEL: ptestz_2:
229 ; X32-NEXT: ptest %xmm1, %xmm0
230 ; X32-NEXT: sbbl %eax, %eax
231 ; X32-NEXT: andl $1, %eax
234 ; X64-LABEL: ptestz_2:
236 ; X64-NEXT: ptest %xmm1, %xmm0
237 ; X64-NEXT: sbbl %eax, %eax
238 ; X64-NEXT: andl $1, %eax
240 %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
; ptestnzc: same shape as ptestz_1 but testing the "not zero and not carry" condition.
244 define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
245 ; X32-LABEL: ptestz_3:
247 ; X32-NEXT: ptest %xmm1, %xmm0
249 ; X32-NEXT: movzbl %al, %eax
252 ; X64-LABEL: ptestz_3:
254 ; X64-NEXT: ptest %xmm1, %xmm0
256 ; X64-NEXT: movzbl %al, %eax
258 %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
263 declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
264 declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
265 declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
267 ; This used to compile to insertps $0 + insertps $16. insertps $0 is always
; Complex-style add of two <2 x float> values built lane-by-lane: lane 1 of
; each input is isolated with movshdup, the two addss results are recombined
; with one insertps (lane 1 from xmm3) instead of two insertps.
269 define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
270 ; X32-LABEL: buildvector:
271 ; X32: ## BB#0: ## %entry
272 ; X32-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
273 ; X32-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
274 ; X32-NEXT: addss %xmm1, %xmm0
275 ; X32-NEXT: addss %xmm2, %xmm3
276 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
279 ; X64-LABEL: buildvector:
280 ; X64: ## BB#0: ## %entry
281 ; X64-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
282 ; X64-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
283 ; X64-NEXT: addss %xmm1, %xmm0
284 ; X64-NEXT: addss %xmm2, %xmm3
285 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
288 %tmp7 = extractelement <2 x float> %A, i32 0
289 %tmp5 = extractelement <2 x float> %A, i32 1
290 %tmp3 = extractelement <2 x float> %B, i32 0
291 %tmp1 = extractelement <2 x float> %B, i32 1
292 %add.r = fadd float %tmp7, %tmp3
293 %add.i = fadd float %tmp5, %tmp1
294 %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
295 %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
296 ret <2 x float> %tmp9
; A shuffle taking lane 0 of a loaded vector into lane 3 should fold the load
; into insertps' memory operand (mem[0] into result lane 3).
299 define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
300 ; X32-LABEL: insertps_from_shufflevector_1:
301 ; X32: ## BB#0: ## %entry
302 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
303 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
306 ; X64-LABEL: insertps_from_shufflevector_1:
307 ; X64: ## BB#0: ## %entry
308 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
311 %0 = load <4 x float>, <4 x float>* %pb, align 16
312 %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
313 ret <4 x float> %vecinit6
; Register-to-register variant: lane 1 of %b into lane 2 of %a, one insertps.
316 define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
317 ; X32-LABEL: insertps_from_shufflevector_2:
318 ; X32: ## BB#0: ## %entry
319 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
322 ; X64-LABEL: insertps_from_shufflevector_2:
323 ; X64: ## BB#0: ## %entry
324 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
327 %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
328 ret <4 x float> %vecinit6
331 ; For loading an i32 from memory into an xmm register we use pinsrd
332 ; instead of insertps
; (Current codegen shown below actually picks pshufd + pblendw for the
; integer domain; the CHECK lines record that sequence.)
333 define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
334 ; X32-LABEL: pinsrd_from_shufflevector_i32:
335 ; X32: ## BB#0: ## %entry
336 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
337 ; X32-NEXT: pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
338 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
341 ; X64-LABEL: pinsrd_from_shufflevector_i32:
342 ; X64: ## BB#0: ## %entry
343 ; X64-NEXT: pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
344 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
347 %0 = load <4 x i32>, <4 x i32>* %pb, align 16
348 %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
349 ret <4 x i32> %vecinit6
; Integer variant of insertps_from_shufflevector_2: lane 3 of %b into lane 1,
; done in the integer domain as pshufd + pblendw.
352 define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
353 ; X32-LABEL: insertps_from_shufflevector_i32_2:
354 ; X32: ## BB#0: ## %entry
355 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
356 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
359 ; X64-LABEL: insertps_from_shufflevector_i32_2:
360 ; X64: ## BB#0: ## %entry
361 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
362 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
365 %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
366 ret <4 x i32> %vecinit6
; A scalar float load inserted into lane 1 via an undef-based shuffle folds
; into insertps' memory operand.
369 define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
370 ; X32-LABEL: insertps_from_load_ins_elt_undef:
372 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
373 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
376 ; X64-LABEL: insertps_from_load_ins_elt_undef:
378 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
380 %1 = load float, float* %b, align 4
381 %2 = insertelement <4 x float> undef, float %1, i32 0
382 %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
383 ret <4 x float> %result
386 ; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
; Integer counterpart: currently movd + pshufd + pblendw instead of a single
; folded insert (hence the TODO above).
387 define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
388 ; X32-LABEL: insertps_from_load_ins_elt_undef_i32:
390 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
391 ; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
392 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
393 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
396 ; X64-LABEL: insertps_from_load_ins_elt_undef_i32:
398 ; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
399 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
400 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
402 %1 = load i32, i32* %b, align 4
403 %2 = insertelement <4 x i32> undef, i32 %1, i32 0
404 %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
405 ret <4 x i32> %result
408 ;;;;;; Shuffles optimizable with a single insertps or blend instruction
; shuf_XYZ0: keep lanes 0-2 of %x, zero lane 3 — xorps + blendps.
409 define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
410 ; X32-LABEL: shuf_XYZ0:
412 ; X32-NEXT: xorps %xmm1, %xmm1
413 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
416 ; X64-LABEL: shuf_XYZ0:
418 ; X64-NEXT: xorps %xmm1, %xmm1
419 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
421 %vecext = extractelement <4 x float> %x, i32 0
422 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
423 %vecext1 = extractelement <4 x float> %x, i32 1
424 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
425 %vecext3 = extractelement <4 x float> %x, i32 2
426 %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
427 %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
428 ret <4 x float> %vecinit5
; shuf_XY00: keep lanes 0-1, zero the upper half — a single movq.
431 define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
432 ; X32-LABEL: shuf_XY00:
434 ; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
437 ; X64-LABEL: shuf_XY00:
439 ; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
441 %vecext = extractelement <4 x float> %x, i32 0
442 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
443 %vecext1 = extractelement <4 x float> %x, i32 1
444 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
445 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
446 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
447 ret <4 x float> %vecinit4
; shuf_XYY0: lanes [0,1,1,zero] — one insertps using its zero mask.
450 define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
451 ; X32-LABEL: shuf_XYY0:
453 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
456 ; X64-LABEL: shuf_XYY0:
458 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
460 %vecext = extractelement <4 x float> %x, i32 0
461 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
462 %vecext1 = extractelement <4 x float> %x, i32 1
463 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
464 %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
465 %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
466 ret <4 x float> %vecinit5
; shuf_XYW0: lanes [0,1,3,zero] — one insertps.
469 define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
470 ; X32-LABEL: shuf_XYW0:
472 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
475 ; X64-LABEL: shuf_XYW0:
477 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
479 %vecext = extractelement <4 x float> %x, i32 0
480 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
481 %vecext1 = extractelement <4 x float> %x, i32 1
482 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
483 %vecext2 = extractelement <4 x float> %x, i32 3
484 %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
485 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
486 ret <4 x float> %vecinit4
; shuf_W00W: lanes [3,zero,zero,3] — one insertps.
489 define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
490 ; X32-LABEL: shuf_W00W:
492 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
495 ; X64-LABEL: shuf_W00W:
497 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
499 %vecext = extractelement <4 x float> %x, i32 3
500 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
501 %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
502 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
503 %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
504 ret <4 x float> %vecinit4
; shuf_X00A: two sources, so two instructions: zero-blend then insertps of
; %a's lane 0 into lane 3.
507 define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
508 ; X32-LABEL: shuf_X00A:
510 ; X32-NEXT: xorps %xmm2, %xmm2
511 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
512 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
515 ; X64-LABEL: shuf_X00A:
517 ; X64-NEXT: xorps %xmm2, %xmm2
518 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
519 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
521 %vecext = extractelement <4 x float> %x, i32 0
522 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
523 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
524 %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
525 %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
526 ret <4 x float> %vecinit4
; shuf_X00X: same pattern but both lanes come from %x, so one insertps suffices.
529 define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
530 ; X32-LABEL: shuf_X00X:
532 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
535 ; X64-LABEL: shuf_X00X:
537 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
539 %vecext = extractelement <4 x float> %x, i32 0
540 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
541 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
542 %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
543 %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
544 ret <4 x float> %vecinit4
; shuf_X0YC: chained shuffles with two sources — two insertps instructions.
547 define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
548 ; X32-LABEL: shuf_X0YC:
550 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
551 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
554 ; X64-LABEL: shuf_X0YC:
556 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
557 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
559 %vecext = extractelement <4 x float> %x, i32 0
560 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
561 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
562 %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
563 %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
564 ret <4 x float> %vecinit5
; Integer-domain versions of the shuf_* tests above: the same lane patterns on
; <4 x i32> should use pxor/pshufd/pblendw (and pmovzxdq for the zero-extend
; shape) instead of the float-domain xorps/insertps/blendps.
567 define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
568 ; X32-LABEL: i32_shuf_XYZ0:
570 ; X32-NEXT: pxor %xmm1, %xmm1
571 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
574 ; X64-LABEL: i32_shuf_XYZ0:
576 ; X64-NEXT: pxor %xmm1, %xmm1
577 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
579 %vecext = extractelement <4 x i32> %x, i32 0
580 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
581 %vecext1 = extractelement <4 x i32> %x, i32 1
582 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
583 %vecext3 = extractelement <4 x i32> %x, i32 2
584 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
585 %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
586 ret <4 x i32> %vecinit5
; i32_shuf_XY00: low half kept, high half zeroed — single movq.
589 define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
590 ; X32-LABEL: i32_shuf_XY00:
592 ; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
595 ; X64-LABEL: i32_shuf_XY00:
597 ; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
599 %vecext = extractelement <4 x i32> %x, i32 0
600 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
601 %vecext1 = extractelement <4 x i32> %x, i32 1
602 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
603 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
604 %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
605 ret <4 x i32> %vecinit4
; i32_shuf_XYY0: pshufd to [0,1,1,3], then pblendw against zero for lane 3.
608 define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
609 ; X32-LABEL: i32_shuf_XYY0:
611 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
612 ; X32-NEXT: pxor %xmm0, %xmm0
613 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
616 ; X64-LABEL: i32_shuf_XYY0:
618 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
619 ; X64-NEXT: pxor %xmm0, %xmm0
620 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
622 %vecext = extractelement <4 x i32> %x, i32 0
623 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
624 %vecext1 = extractelement <4 x i32> %x, i32 1
625 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
626 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
627 %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
628 ret <4 x i32> %vecinit5
; i32_shuf_XYW0: pshufd to [0,1,3,3], then zero lane 3 with pblendw.
631 define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
632 ; X32-LABEL: i32_shuf_XYW0:
634 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
635 ; X32-NEXT: pxor %xmm0, %xmm0
636 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
639 ; X64-LABEL: i32_shuf_XYW0:
641 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
642 ; X64-NEXT: pxor %xmm0, %xmm0
643 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
645 %vecext = extractelement <4 x i32> %x, i32 0
646 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
647 %vecext1 = extractelement <4 x i32> %x, i32 1
648 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
649 %vecext2 = extractelement <4 x i32> %x, i32 3
650 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
651 %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
652 ret <4 x i32> %vecinit4
; i32_shuf_W00W: lane 3 into lanes 0 and 3, zeros between — pshufd + pblendw.
655 define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
656 ; X32-LABEL: i32_shuf_W00W:
658 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
659 ; X32-NEXT: pxor %xmm0, %xmm0
660 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
663 ; X64-LABEL: i32_shuf_W00W:
665 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
666 ; X64-NEXT: pxor %xmm0, %xmm0
667 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
669 %vecext = extractelement <4 x i32> %x, i32 3
670 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
671 %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
672 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
673 %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
674 ret <4 x i32> %vecinit4
; i32_shuf_X00A: zero-blend %x's lane 0, then blend %a's lane 0 into lane 3.
677 define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
678 ; X32-LABEL: i32_shuf_X00A:
680 ; X32-NEXT: pxor %xmm2, %xmm2
681 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
682 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
683 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
686 ; X64-LABEL: i32_shuf_X00A:
688 ; X64-NEXT: pxor %xmm2, %xmm2
689 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
690 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
691 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
693 %vecext = extractelement <4 x i32> %x, i32 0
694 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
695 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
696 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
697 %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
698 ret <4 x i32> %vecinit4
; i32_shuf_X00X: both live lanes from %x — pshufd + single zero pblendw.
701 define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
702 ; X32-LABEL: i32_shuf_X00X:
704 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
705 ; X32-NEXT: pxor %xmm0, %xmm0
706 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
709 ; X64-LABEL: i32_shuf_X00X:
711 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
712 ; X64-NEXT: pxor %xmm0, %xmm0
713 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
715 %vecext = extractelement <4 x i32> %x, i32 0
716 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
717 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
718 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
719 %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
720 ret <4 x i32> %vecinit4
; i32_shuf_X0YC: the [X,0,Y,_] half is recognized as a pmovzxdq zero-extend,
; then %a's lane 2 is blended into lane 3.
723 define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
724 ; X32-LABEL: i32_shuf_X0YC:
726 ; X32-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
727 ; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
728 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
731 ; X64-LABEL: i32_shuf_X0YC:
733 ; X64-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
734 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
735 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
737 %vecext = extractelement <4 x i32> %x, i32 0
738 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
739 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
740 %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
741 %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
742 ret <4 x i32> %vecinit5
745 ;; Test for a bug in the first implementation of LowerBuildVectorv4x32
; The build-vector result feeds both operands of a max (via fcmp/select), so
; it may not be lowered as if lane 3 were undef: lane 3 must really be zeroed
; (xorps + blendps into a *separate* register, original %x preserved in xmm0).
746 define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
747 ; X32-LABEL: test_insertps_no_undef:
749 ; X32-NEXT: xorps %xmm1, %xmm1
750 ; X32-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
751 ; X32-NEXT: maxps %xmm1, %xmm0
754 ; X64-LABEL: test_insertps_no_undef:
756 ; X64-NEXT: xorps %xmm1, %xmm1
757 ; X64-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
758 ; X64-NEXT: maxps %xmm1, %xmm0
760 %vecext = extractelement <4 x float> %x, i32 0
761 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
762 %vecext1 = extractelement <4 x float> %x, i32 1
763 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
764 %vecext3 = extractelement <4 x float> %x, i32 2
765 %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
766 %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
767 %mask = fcmp olt <4 x float> %vecinit5, %x
768 %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
; A select on an <8 x i1> mask should fall back to pblendvb: the i1 mask is
; widened to full-width lanes with psllw/psraw (sign-extend bit 0), and the
; implicit xmm0 mask operand of pblendvb forces the movdqa at the end.
772 define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
773 ; X32-LABEL: blendvb_fallback:
775 ; X32-NEXT: psllw $15, %xmm0
776 ; X32-NEXT: psraw $15, %xmm0
777 ; X32-NEXT: pblendvb %xmm1, %xmm2
778 ; X32-NEXT: movdqa %xmm2, %xmm0
781 ; X64-LABEL: blendvb_fallback:
783 ; X64-NEXT: psllw $15, %xmm0
784 ; X64-NEXT: psraw $15, %xmm0
785 ; X64-NEXT: pblendvb %xmm1, %xmm2
786 ; X64-NEXT: movdqa %xmm2, %xmm0
788 %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
792 ; On X32, account for the argument's move to registers
; These three tests match the insertps immediate *and* the memory operand
; explicitly, to verify the load (and its offset/scale) is folded correctly.
793 define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
794 ; X32-LABEL: insertps_from_vector_load:
796 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
797 ; X32-NEXT: insertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
800 ; X64-LABEL: insertps_from_vector_load:
802 ; X64-NEXT: insertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
804 %1 = load <4 x float>, <4 x float>* %pb, align 16
805 %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
809 ;; Use a non-zero CountS for insertps
810 ;; Try to match a bit more of the instr, since we need the load's offset.
; imm 96 selects source element 1, so the folded load is at byte offset 4.
811 define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
812 ; X32-LABEL: insertps_from_vector_load_offset:
814 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
815 ; X32-NEXT: insertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
818 ; X64-LABEL: insertps_from_vector_load_offset:
820 ; X64-NEXT: insertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
822 %1 = load <4 x float>, <4 x float>* %pb, align 16
823 %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
827 ;; Try to match a bit more of the instr, since we need the load's offset.
; Indexed GEP variant: the index is scaled by 16 (shll/shlq $4) and imm 192
; (source element 3) puts the folded load at displacement 12.
828 define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
829 ; X32-LABEL: insertps_from_vector_load_offset_2:
831 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
832 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
833 ; X32-NEXT: shll $4, %ecx
834 ; X32-NEXT: insertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
837 ; X64-LABEL: insertps_from_vector_load_offset_2:
839 ; X64-NEXT: shlq $4, %rsi
840 ; X64-NEXT: insertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
842 %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
843 %2 = load <4 x float>, <4 x float>* %1, align 16
844 %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
; A scalar float broadcast (same loaded value in all four lanes) feeding
; insertps: currently loads with movss, splats with shufps, then insertps
; takes lane 0 of the splat.
848 define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
849 ; X32-LABEL: insertps_from_broadcast_loadf32:
851 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
852 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
853 ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
854 ; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
855 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
858 ; X64-LABEL: insertps_from_broadcast_loadf32:
860 ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
861 ; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
862 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
864 %1 = getelementptr inbounds float, float* %fb, i64 %index
865 %2 = load float, float* %1, align 4
866 %3 = insertelement <4 x float> undef, float %2, i32 0
867 %4 = insertelement <4 x float> %3, float %2, i32 1
868 %5 = insertelement <4 x float> %4, float %2, i32 2
869 %6 = insertelement <4 x float> %5, float %2, i32 3
870 %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
; Same broadcast shape but built from a full (unaligned) vector load.
874 define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
875 ; X32-LABEL: insertps_from_broadcast_loadv4f32:
877 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
878 ; X32-NEXT: movups (%eax), %xmm1
879 ; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
880 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
883 ; X64-LABEL: insertps_from_broadcast_loadv4f32:
885 ; X64-NEXT: movups (%rdi), %xmm1
886 ; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
887 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
889 %1 = load <4 x float>, <4 x float>* %b, align 4
890 %2 = extractelement <4 x float> %1, i32 0
891 %3 = insertelement <4 x float> undef, float %2, i32 0
892 %4 = insertelement <4 x float> %3, float %2, i32 1
893 %5 = insertelement <4 x float> %4, float %2, i32 2
894 %6 = insertelement <4 x float> %5, float %2, i32 3
895 %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
899 ;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
; The broadcast value is used by four insertps calls; the splat should be
; materialized once (in xmm4) and reused, with the four results summed.
900 define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
901 ; X32-LABEL: insertps_from_broadcast_multiple_use:
903 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
904 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
905 ; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
906 ; X32-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
907 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
908 ; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
909 ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
910 ; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
911 ; X32-NEXT: addps %xmm1, %xmm0
912 ; X32-NEXT: addps %xmm2, %xmm3
913 ; X32-NEXT: addps %xmm3, %xmm0
916 ; X64-LABEL: insertps_from_broadcast_multiple_use:
918 ; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
919 ; X64-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
920 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
921 ; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
922 ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
923 ; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
924 ; X64-NEXT: addps %xmm1, %xmm0
925 ; X64-NEXT: addps %xmm2, %xmm3
926 ; X64-NEXT: addps %xmm3, %xmm0
928 %1 = getelementptr inbounds float, float* %fb, i64 %index
929 %2 = load float, float* %1, align 4
930 %3 = insertelement <4 x float> undef, float %2, i32 0
931 %4 = insertelement <4 x float> %3, float %2, i32 1
932 %5 = insertelement <4 x float> %4, float %2, i32 2
933 %6 = insertelement <4 x float> %5, float %2, i32 3
934 %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
935 %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
936 %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
937 %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
938 %11 = fadd <4 x float> %7, %8
939 %12 = fadd <4 x float> %9, %10
940 %13 = fadd <4 x float> %11, %12
; Shuffle with undef result lanes mixing a loaded scalar (%2) and %a;
; per the checks this is lowered with unpcklpd rather than insertps.
944 define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
945 ; X32-LABEL: insertps_with_undefs:
947 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
948 ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
949 ; X32-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
950 ; X32-NEXT: movapd %xmm1, %xmm0
953 ; X64-LABEL: insertps_with_undefs:
955 ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
956 ; X64-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
957 ; X64-NEXT: movapd %xmm1, %xmm0
959 %1 = load float, float* %b, align 4
960 %2 = insertelement <4 x float> undef, float %1, i32 0
961 %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7> ; lanes: [%2[0], undef, %a[0], %2[3]]
962 ret <4 x float> %result
965 ; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using
966 ; the destination index to change the load, instead of the source index.
967 define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
968 ; X32-LABEL: pr20087:
970 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
971 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
974 ; X64-LABEL: pr20087:
976 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
978 %load = load <4 x float> , <4 x float> *%ptr
979 %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2> ; lanes: [%a[0], undef, %a[2], %load[2]]
983 ; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
984 define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* noalias nocapture %RET) #1 {
985 ; X32-LABEL: insertps_pr20411:
987 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
988 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
989 ; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
990 ; X32-NEXT: movdqu %xmm1, (%eax)
993 ; X64-LABEL: insertps_pr20411:
995 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
996 ; X64-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
997 ; X64-NEXT: movdqu %xmm1, (%rdi)
999 %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 0, i32 7, i32 undef, i32 undef> ; lanes: [%shuffle109[0], %shuffle116[3], undef, undef]
1000 %ptrcast = bitcast i32* %RET to <4 x i32>*
1001 store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4 ; align 4 -> unaligned store (movdqu)
; Build-vector [A[0], 0, B[2], 0]; folds to a single insertps.
1005 define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
1006 ; X32-LABEL: insertps_4:
1007 ; X32: ## BB#0: ## %entry
1008 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
1011 ; X64-LABEL: insertps_4:
1012 ; X64: ## BB#0: ## %entry
1013 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
1016 %vecext = extractelement <4 x float> %A, i32 0
1017 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1018 %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
1019 %vecext2 = extractelement <4 x float> %B, i32 2
1020 %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
1021 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1022 ret <4 x float> %vecinit4
; Build-vector [A[0], B[1], 0, 0]; folds to a single insertps.
1025 define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
1026 ; X32-LABEL: insertps_5:
1027 ; X32: ## BB#0: ## %entry
1028 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
1031 ; X64-LABEL: insertps_5:
1032 ; X64: ## BB#0: ## %entry
1033 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
1036 %vecext = extractelement <4 x float> %A, i32 0
1037 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1038 %vecext1 = extractelement <4 x float> %B, i32 1
1039 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
1040 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
1041 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1042 ret <4 x float> %vecinit4
; Build-vector [0, A[1], B[2], 0]; folds to a single insertps.
; Note the lane-1 zero comes from the constant start vector, not an insert.
1045 define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
1046 ; X32-LABEL: insertps_6:
1047 ; X32: ## BB#0: ## %entry
1048 ; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
1051 ; X64-LABEL: insertps_6:
1052 ; X64: ## BB#0: ## %entry
1053 ; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
1056 %vecext = extractelement <4 x float> %A, i32 1
1057 %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
1058 %vecext1 = extractelement <4 x float> %B, i32 2
1059 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
1060 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
1061 ret <4 x float> %vecinit3
; Build-vector [A[0], 0, B[1], 0]; folds to a single insertps.
1064 define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
1065 ; X32-LABEL: insertps_7:
1066 ; X32: ## BB#0: ## %entry
1067 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
1070 ; X64-LABEL: insertps_7:
1071 ; X64: ## BB#0: ## %entry
1072 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
1075 %vecext = extractelement <4 x float> %A, i32 0
1076 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1077 %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
1078 %vecext2 = extractelement <4 x float> %B, i32 1
1079 %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
1080 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1081 ret <4 x float> %vecinit4
; Build-vector [A[0], B[0], 0, 0]; folds to a single insertps.
1084 define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
1085 ; X32-LABEL: insertps_8:
1086 ; X32: ## BB#0: ## %entry
1087 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1090 ; X64-LABEL: insertps_8:
1091 ; X64: ## BB#0: ## %entry
1092 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1095 %vecext = extractelement <4 x float> %A, i32 0
1096 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1097 %vecext1 = extractelement <4 x float> %B, i32 0
1098 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
1099 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
1100 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1101 ret <4 x float> %vecinit4
; Build-vector [0, A[0], B[2], 0]; folds to a single insertps, but the
; result is built in xmm1 (B's register), so an extra movaps copies it back.
1104 define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
1105 ; X32-LABEL: insertps_9:
1106 ; X32: ## BB#0: ## %entry
1107 ; X32-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
1108 ; X32-NEXT: movaps %xmm1, %xmm0
1111 ; X64-LABEL: insertps_9:
1112 ; X64: ## BB#0: ## %entry
1113 ; X64-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
1114 ; X64-NEXT: movaps %xmm1, %xmm0
1117 %vecext = extractelement <4 x float> %A, i32 0
1118 %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
1119 %vecext1 = extractelement <4 x float> %B, i32 2
1120 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
1121 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
1122 ret <4 x float> %vecinit3
; Single-source case: [A[0], 0, A[0], 0] from a zero vector; still folds
; to one insertps with A as both source and destination.
1125 define <4 x float> @insertps_10(<4 x float> %A)
1126 ; X32-LABEL: insertps_10:
1128 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
1131 ; X64-LABEL: insertps_10:
1133 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
1136 %vecext = extractelement <4 x float> %A, i32 0
1137 %vecbuild1 = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %vecext, i32 0
1138 %vecbuild2 = insertelement <4 x float> %vecbuild1, float %vecext, i32 2
1139 ret <4 x float> %vecbuild2
; Insert into a zero vector then shuffle with %A: result [0, A[1], 0, A[3]].
; Lowers to a zeroed register plus a single blendps rather than insertps.
1142 define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
1143 ; X32-LABEL: build_vector_to_shuffle_1:
1144 ; X32: ## BB#0: ## %entry
1145 ; X32-NEXT: xorps %xmm1, %xmm1
1146 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1149 ; X64-LABEL: build_vector_to_shuffle_1:
1150 ; X64: ## BB#0: ## %entry
1151 ; X64-NEXT: xorps %xmm1, %xmm1
1152 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1155 %vecext = extractelement <4 x float> %A, i32 1
1156 %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
1157 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
1158 %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 7> ; lane 3 comes from %A[3]
1159 ret <4 x float> %vecinit3
; Insert into a zero vector with no trailing shuffle: result [0, A[1], 0, 0].
; Lowers to a zeroed register plus a single blendps.
1162 define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
1163 ; X32-LABEL: build_vector_to_shuffle_2:
1164 ; X32: ## BB#0: ## %entry
1165 ; X32-NEXT: xorps %xmm1, %xmm1
1166 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1169 ; X64-LABEL: build_vector_to_shuffle_2:
1170 ; X64: ## BB#0: ## %entry
1171 ; X64-NEXT: xorps %xmm1, %xmm1
1172 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1175 %vecext = extractelement <4 x float> %A, i32 1
1176 %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
1177 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
1178 ret <4 x float> %vecinit1