1 ; Tests for SSE2 and below, without SSE3+.
2 ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s
; test1: replace the low element of a loaded <2 x double> with the scalar %B.
; The shuffle mask <2, 1> takes element 0 from the vector holding %B and
; element 1 from the value loaded from %A; the result is stored to %r.
; The expected assembly performs the insertion with a single movlpd from the
; stack slot of %B.
4 define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
7 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
8 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
9 ; CHECK-NEXT: movapd (%ecx), %xmm0
10 ; CHECK-NEXT: movlpd {{[0-9]+}}(%esp), %xmm0
11 ; CHECK-NEXT: movapd %xmm0, (%eax)
13 %tmp3 = load <2 x double>, <2 x double>* %A, align 16
14 %tmp7 = insertelement <2 x double> undef, double %B, i32 0
15 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
16 store <2 x double> %tmp9, <2 x double>* %r, align 16
; test2: replace the high element of a loaded <2 x double> with the scalar %B.
; The shuffle mask <0, 2> keeps element 0 of the value loaded from %A and
; takes element 0 of the vector holding %B as the new element 1.
; The expected assembly performs the insertion with a single movhpd from the
; stack slot of %B.
20 define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
23 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
24 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
25 ; CHECK-NEXT: movapd (%ecx), %xmm0
26 ; CHECK-NEXT: movhpd {{[0-9]+}}(%esp), %xmm0
27 ; CHECK-NEXT: movapd %xmm0, (%eax)
29 %tmp3 = load <2 x double>, <2 x double>* %A, align 16
30 %tmp7 = insertelement <2 x double> undef, double %B, i32 0
31 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
32 store <2 x double> %tmp9, <2 x double>* %r, align 16
; test3: build <A[0], B[0], A[1], B[1]> element by element with
; extractelement/insertelement and store it to %res.
; The expected assembly recognizes this as an interleave of the low halves and
; emits one unpcklps with a memory operand instead of scalar moves.
37 define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
40 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
41 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
42 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
43 ; CHECK-NEXT: movaps (%edx), %xmm0
44 ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
45 ; CHECK-NEXT: movaps %xmm0, (%eax)
47 %tmp = load <4 x float>, <4 x float>* %B ; <<4 x float>> [#uses=2]
48 %tmp3 = load <4 x float>, <4 x float>* %A ; <<4 x float>> [#uses=2]
49 %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0 ; <float> [#uses=1]
50 %tmp7 = extractelement <4 x float> %tmp, i32 0 ; <float> [#uses=1]
51 %tmp8 = extractelement <4 x float> %tmp3, i32 1 ; <float> [#uses=1]
52 %tmp9 = extractelement <4 x float> %tmp, i32 1 ; <float> [#uses=1]
53 %tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.1, i32 0 ; <<4 x float>> [#uses=1]
54 %tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1 ; <<4 x float>> [#uses=1]
55 %tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2 ; <<4 x float>> [#uses=1]
56 %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3 ; <<4 x float>> [#uses=1]
57 store <4 x float> %tmp13, <4 x float>* %res
; test4: shuffle %X against an undef second operand with mask <2, 6, 3, 7>.
; Mask indices 6 and 7 select from the undef operand, so only result elements
; 0 and 2 (X[2] and X[3]) are defined.
; The expected assembly is a single shufps of xmm0 with itself.
61 define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
64 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
65 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
66 ; CHECK-NEXT: movaps %xmm0, (%eax)
68 %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1]
69 store <4 x float> %tmp5, <4 x float>* %res
; test5: load one float through the pointer stored at %ptr, place it in lane 0
; of a vector whose other lanes are 0.0, then widen twice:
;   - as <16 x i8>, interleave its low 8 bytes with zero bytes;
;   - as <8 x i16>, interleave a zero vector with the low four i16 lanes.
; The final value is bitcast to <4 x i32>.
; The expected assembly is a movss load, a pxor to materialize zero, and a
; punpcklbw / punpcklwd pair.
73 define <4 x i32> @test5(i8** %ptr) nounwind {
76 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
77 ; CHECK-NEXT: movl (%eax), %eax
78 ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
79 ; CHECK-NEXT: pxor %xmm0, %xmm0
80 ; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
81 ; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
83 %tmp = load i8*, i8** %ptr ; <i8*> [#uses=1]
84 %tmp.upgrd.1 = bitcast i8* %tmp to float* ; <float*> [#uses=1]
85 %tmp.upgrd.2 = load float, float* %tmp.upgrd.1 ; <float> [#uses=1]
86 %tmp.upgrd.3 = insertelement <4 x float> undef, float %tmp.upgrd.2, i32 0 ; <<4 x float>> [#uses=1]
87 %tmp9 = insertelement <4 x float> %tmp.upgrd.3, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
88 %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
89 %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
90 %tmp21 = bitcast <4 x float> %tmp11 to <16 x i8> ; <<16 x i8>> [#uses=1]
91 %tmp22 = shufflevector <16 x i8> %tmp21, <16 x i8> zeroinitializer, <16 x i32> < i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23 > ; <<16 x i8>> [#uses=1]
92 %tmp31 = bitcast <16 x i8> %tmp22 to <8 x i16> ; <<8 x i16>> [#uses=1]
93 %tmp.upgrd.4 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp31, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 > ; <<8 x i16>> [#uses=1]
94 %tmp36 = bitcast <8 x i16> %tmp.upgrd.4 to <4 x i32> ; <<4 x i32>> [#uses=1]
; test6: shuffle a loaded <4 x float> against undef with mask <0, 5, 6, 7>.
; Only result element 0 comes from the loaded value; indices 5-7 select from
; the undef operand.
; The expected assembly contains no shuffle instruction at all: the vector is
; simply loaded and stored with movaps.
98 define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
101 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
102 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
103 ; CHECK-NEXT: movaps (%ecx), %xmm0
104 ; CHECK-NEXT: movaps %xmm0, (%eax)
106 %tmp1 = load <4 x float>, <4 x float>* %A ; <<4 x float>> [#uses=1]
107 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
108 store <4 x float> %tmp2, <4 x float>* %res
; test7: splat element 0 of an all-zero vector (shuffle mask zeroinitializer)
; and store the result to the null address.
; The expected assembly materializes the zero vector with xorps and stores it
; to absolute address 0.
112 define void @test7() nounwind {
113 ; CHECK-LABEL: test7:
115 ; CHECK-NEXT: xorps %xmm0, %xmm0
116 ; CHECK-NEXT: movaps %xmm0, 0
118 bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>>:1 [#uses=1]
119 shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer ; <<4 x float>>:2 [#uses=1]
120 store <4 x float> %2, <4 x float>* null
; External array of four i32 values read by test8.
124 @x = external global [4 x i32]
; test8: load the four consecutive i32 elements of @x one at a time, insert
; them into lanes 0-3 of a <4 x i32>, and bitcast the result to <2 x i64>.
; The expected assembly merges the four scalar loads into one movups through
; the non-lazy pointer for @x.
126 define <2 x i64> @test8() nounwind {
127 ; CHECK-LABEL: test8:
129 ; CHECK-NEXT: movl L_x$non_lazy_ptr, %eax
130 ; CHECK-NEXT: movups (%eax), %xmm0
132 %tmp = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 0) ; <i32> [#uses=1]
133 %tmp3 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 1) ; <i32> [#uses=1]
134 %tmp5 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 2) ; <i32> [#uses=1]
135 %tmp7 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 3) ; <i32> [#uses=1]
136 %tmp.upgrd.1 = insertelement <4 x i32> undef, i32 %tmp, i32 0 ; <<4 x i32>> [#uses=1]
137 %tmp13 = insertelement <4 x i32> %tmp.upgrd.1, i32 %tmp3, i32 1 ; <<4 x i32>> [#uses=1]
138 %tmp14 = insertelement <4 x i32> %tmp13, i32 %tmp5, i32 2 ; <<4 x i32>> [#uses=1]
139 %tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3 ; <<4 x i32>> [#uses=1]
140 %tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64> ; <<2 x i64>> [#uses=1]
; test9: build a <4 x float> from the scalar arguments %a, %b, %c, %d, which
; follow a leading i32 argument, and return it.
; The expected assembly loads all four floats from the stack with one movups.
; NOTE(review): the unaligned form is presumably required because %dummy
; shifts the floats off a 16-byte boundary - confirm against the calling
; convention.
144 define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
145 ; CHECK-LABEL: test9:
147 ; CHECK-NEXT: movups {{[0-9]+}}(%esp), %xmm0
149 %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
150 %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
151 %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
152 %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
153 ret <4 x float> %tmp13
; test10: build a <4 x float> from the four scalar float arguments and return
; it. Unlike test9 there is no leading integer argument.
; The expected assembly loads all four floats from the stack with one movaps.
156 define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
157 ; CHECK-LABEL: test10:
159 ; CHECK-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
161 %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
162 %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
163 %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
164 %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
165 ret <4 x float> %tmp13
; test11: build a <2 x double> from the two scalar double arguments and return
; it.
; The expected assembly loads both doubles from the stack with one movaps.
168 define <2 x double> @test11(double %a, double %b) nounwind {
169 ; CHECK-LABEL: test11:
171 ; CHECK-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
173 %tmp = insertelement <2 x double> undef, double %a, i32 0 ; <<2 x double>> [#uses=1]
174 %tmp7 = insertelement <2 x double> %tmp, double %b, i32 1 ; <<2 x double>> [#uses=1]
175 ret <2 x double> %tmp7
; test12: load a <4 x float> V from the null address and form two shuffles:
;   %tmp2 = <V[0], V[1], 1.0, 1.0>   (mask <0, 1, 6, 7> against a 1.0 splat)
;   %tmp3 = <V[2], V[3], 0.0, 0.0>   (mask <2, 3, 6, 7> against zero)
; The two are added and the sum is stored back to the null address.
; The expected assembly implements both shuffles with 64-bit-granular
; instructions (movsd and unpckhpd) before the addps.
178 define void @test12() nounwind {
179 ; CHECK-LABEL: test12:
181 ; CHECK-NEXT: movapd 0, %xmm0
182 ; CHECK-NEXT: movapd {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
183 ; CHECK-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
184 ; CHECK-NEXT: xorpd %xmm2, %xmm2
185 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
186 ; CHECK-NEXT: addps %xmm1, %xmm0
187 ; CHECK-NEXT: movaps %xmm0, 0
189 %tmp1 = load <4 x float>, <4 x float>* null ; <<4 x float>> [#uses=2]
190 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
191 %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
192 %tmp4 = fadd <4 x float> %tmp2, %tmp3 ; <<4 x float>> [#uses=1]
193 store <4 x float> %tmp4, <4 x float>* null
; test13: shuffle the vectors loaded from %B and %C with mask <1, 4, 1, 5>,
; producing <B[1], C[0], B[1], C[1]>, and store the result to %res.
; %A is not referenced by the visible instructions.
; The expected assembly uses two shufps instructions: one combining the
; register with memory, and one permuting the register in place.
197 define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
198 ; CHECK-LABEL: test13:
200 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
201 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
202 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
203 ; CHECK-NEXT: movaps (%edx), %xmm0
204 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
205 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
206 ; CHECK-NEXT: movaps %xmm0, (%eax)
208 %tmp3 = load <4 x float>, <4 x float>* %B ; <<4 x float>> [#uses=1]
209 %tmp5 = load <4 x float>, <4 x float>* %C ; <<4 x float>> [#uses=1]
210 %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1]
211 store <4 x float> %tmp11, <4 x float>* %res
; test14: compute x + y and x - y on the vectors loaded from %x and %y, then
; return the low two lanes of the sum followed by the low two lanes of the
; difference (mask <0, 1, 4, 5>).
; The expected assembly combines the two low halves with one unpcklpd.
215 define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
216 ; CHECK-LABEL: test14:
218 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
219 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
220 ; CHECK-NEXT: movaps (%ecx), %xmm1
221 ; CHECK-NEXT: movaps (%eax), %xmm2
222 ; CHECK-NEXT: movaps %xmm2, %xmm0
223 ; CHECK-NEXT: addps %xmm1, %xmm0
224 ; CHECK-NEXT: subps %xmm1, %xmm2
225 ; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
227 %tmp = load <4 x float>, <4 x float>* %y ; <<4 x float>> [#uses=2]
228 %tmp5 = load <4 x float>, <4 x float>* %x ; <<4 x float>> [#uses=2]
229 %tmp9 = fadd <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
230 %tmp21 = fsub <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
231 %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 > ; <<4 x float>> [#uses=1]
232 ret <4 x float> %tmp27
; test15: return the high two lanes of the vector loaded from %x followed by
; the high two lanes of the vector loaded from %y (mask <2, 3, 6, 7>).
; The expected assembly is one unpckhpd with a memory operand.
235 define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
236 ; CHECK-LABEL: test15:
237 ; CHECK: ## BB#0: ## %entry
238 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
239 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
240 ; CHECK-NEXT: movapd (%ecx), %xmm0
241 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
244 %tmp = load <4 x float>, <4 x float>* %y ; <<4 x float>> [#uses=1]
245 %tmp3 = load <4 x float>, <4 x float>* %x ; <<4 x float>> [#uses=1]
246 %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
247 ret <4 x float> %tmp4
; test16: load the <4 x double> at index 3 from %srcA and extract its elements
; 0 and 2 into a <2 x double> (mask <0, 2>).
; A <4 x double> is 32 bytes, so index 3 is byte offset 96, which appears as
; the displacement of the expected movapd. The second element is then merged
; in with an unpcklpd that reads from memory.
; %dst is not referenced by the visible instructions.
252 define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) {
253 ; CHECK-LABEL: test16:
255 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
256 ; CHECK-NEXT: movapd 96(%eax), %xmm0
257 ; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
259 %i5 = getelementptr inbounds <4 x double>, <4 x double>* %srcA, i32 3
260 %i6 = load <4 x double>, <4 x double>* %i5, align 32
261 %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
; test17: shuffle the constant <undef, undef, 32768, 32768> with a vector that
; only has undef inserted into it, using mask <4, 5, 2, 3>. Lanes 0-1 come from
; the undef-derived operand and lanes 2-3 from the constant.
; The result is bitcast to <4 x float> and stored through an undef pointer.
; The expected assembly loads the constant <u,u,32768,32768> directly and
; stores it with movaps.
266 define fastcc void @test17() nounwind {
267 ; CHECK-LABEL: test17:
268 ; CHECK: ## BB#0: ## %entry
269 ; CHECK-NEXT: movaps {{.*#+}} xmm0 = <u,u,32768,32768>
270 ; CHECK-NEXT: movaps %xmm0, (%eax)
273 %0 = insertelement <4 x i32> undef, i32 undef, i32 1
274 %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
275 %2 = bitcast <4 x i32> %1 to <4 x float>
276 store <4 x float> %2, <4 x float> * undef
; f: truncate a <4 x double> argument to <4 x float> and return it.
; The expected assembly converts each 128-bit half with cvtpd2ps and joins the
; two results with unpcklpd.
281 define <4 x float> @f(<4 x double>) nounwind {
283 ; CHECK: ## BB#0: ## %entry
284 ; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1
285 ; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0
286 ; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
289 %double2float.i = fptrunc <4 x double> %0 to <4 x float>
290 ret <4 x float> %double2float.i
; test_insert_64_zext: keep element 0 of %i and take element 1 from the
; constant <0, undef> (mask <0, 2>), giving <i[0], 0>.
; The expected assembly is a single movq that zeroes the upper element.
293 define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
294 ; CHECK-LABEL: test_insert_64_zext:
296 ; CHECK-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
298 %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
; PR19721: view %i as an i128, AND it with -4294967296 (all bits set except
; the low 32), and view the result as <4 x i32> again.
; The expected assembly is a single andps with a constant-pool mask.
302 define <4 x i32> @PR19721(<4 x i32> %i) {
303 ; CHECK-LABEL: PR19721:
305 ; CHECK-NEXT: andps LCPI19_0, %xmm0
307 %bc = bitcast <4 x i32> %i to i128
308 %insert = and i128 %bc, -4294967296
309 %bc2 = bitcast i128 %insert to <4 x i32>
; test_mul: multiply two <4 x i32> vectors lane by lane.
; The expected assembly uses two pmuludq instructions: one on the original
; operands, and one on copies whose odd lanes were moved into even positions
; by pshufd. pshufd and punpckldq then interleave the two sets of results.
313 define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
314 ; CHECK-LABEL: test_mul:
316 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
317 ; CHECK-NEXT: pmuludq %xmm1, %xmm0
318 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
319 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
320 ; CHECK-NEXT: pmuludq %xmm2, %xmm1
321 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
322 ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
324 %m = mul <4 x i32> %x, %y