1 ; Tests for SSE2 and below, without SSE3+.
2 ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s
; test1: load a <2 x double> from %A, replace element 0 with the scalar %B
; (shuffle mask <2, 1> takes lane 0 from %tmp7 and lane 1 from %tmp3), and
; store the result to %r. The expected assembly merges %B directly from the
; stack with movlpd.
4 define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
7 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
8 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
9 ; CHECK-NEXT: movapd (%ecx), %xmm0
10 ; CHECK-NEXT: movlpd {{[0-9]+}}(%esp), %xmm0
11 ; CHECK-NEXT: movapd %xmm0, (%eax)
13 %tmp3 = load <2 x double>* %A, align 16
14 %tmp7 = insertelement <2 x double> undef, double %B, i32 0
15 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
16 store <2 x double> %tmp9, <2 x double>* %r, align 16
; test2: load a <2 x double> from %A, replace element 1 with the scalar %B
; (shuffle mask <0, 2> keeps lane 0 of %tmp3 and takes lane 0 of %tmp7), and
; store the result to %r. The expected assembly merges %B directly from the
; stack with movhpd.
20 define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
23 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
24 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
25 ; CHECK-NEXT: movapd (%ecx), %xmm0
26 ; CHECK-NEXT: movhpd {{[0-9]+}}(%esp), %xmm0
27 ; CHECK-NEXT: movapd %xmm0, (%eax)
29 %tmp3 = load <2 x double>* %A, align 16
30 %tmp7 = insertelement <2 x double> undef, double %B, i32 0
31 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
32 store <2 x double> %tmp9, <2 x double>* %r, align 16
; test3: build the vector <A[0], B[0], A[1], B[1]> one element at a time with
; extractelement/insertelement and store it to %res. The expected assembly
; recognises the interleave of the two low halves as a single unpcklps with a
; memory operand.
37 define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
40 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
41 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
42 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
43 ; CHECK-NEXT: movaps (%edx), %xmm0
44 ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
45 ; CHECK-NEXT: movaps %xmm0, (%eax)
47 %tmp = load <4 x float>* %B ; <<4 x float>> [#uses=2]
48 %tmp3 = load <4 x float>* %A ; <<4 x float>> [#uses=2]
49 %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0 ; <float> [#uses=1]
50 %tmp7 = extractelement <4 x float> %tmp, i32 0 ; <float> [#uses=1]
51 %tmp8 = extractelement <4 x float> %tmp3, i32 1 ; <float> [#uses=1]
52 %tmp9 = extractelement <4 x float> %tmp, i32 1 ; <float> [#uses=1]
53 %tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.1, i32 0 ; <<4 x float>> [#uses=1]
54 %tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1 ; <<4 x float>> [#uses=1]
55 %tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2 ; <<4 x float>> [#uses=1]
56 %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3 ; <<4 x float>> [#uses=1]
57 store <4 x float> %tmp13, <4 x float>* %res
; test4: shuffle %X against an undef vector with mask <2, 6, 3, 7> and store
; the result to %res. Mask indices 6 and 7 select from the undef operand, so
; only lanes 0 and 2 of the result are defined (X[2] and X[3]). The expected
; assembly is a single pshufd followed by the store.
61 define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
64 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
65 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,0]
66 ; CHECK-NEXT: movdqa %xmm0, (%eax)
68 %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1]
69 store <4 x float> %tmp5, <4 x float>* %res
; test5: load a float through the pointer stored at %ptr, place it in lane 0
; of a <4 x float> whose other lanes are 0.0, then widen it in two steps:
;   1. as <16 x i8>, interleave the low 8 bytes with zero bytes;
;   2. as <8 x i16>, interleave the low 4 elements of a zero vector with the
;      low 4 elements of the step-1 result.
; The final value is reinterpreted as <4 x i32>. The expected assembly is a
; movss load, a pxor to materialise zero, then punpcklbw and punpcklwd.
73 define <4 x i32> @test5(i8** %ptr) nounwind {
76 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
77 ; CHECK-NEXT: movl (%eax), %eax
78 ; CHECK-NEXT: movss (%eax), %xmm1
79 ; CHECK-NEXT: pxor %xmm0, %xmm0
80 ; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
81 ; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
83 %tmp = load i8** %ptr ; <i8*> [#uses=1]
84 %tmp.upgrd.1 = bitcast i8* %tmp to float* ; <float*> [#uses=1]
85 %tmp.upgrd.2 = load float* %tmp.upgrd.1 ; <float> [#uses=1]
86 %tmp.upgrd.3 = insertelement <4 x float> undef, float %tmp.upgrd.2, i32 0 ; <<4 x float>> [#uses=1]
87 %tmp9 = insertelement <4 x float> %tmp.upgrd.3, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
88 %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
89 %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
90 %tmp21 = bitcast <4 x float> %tmp11 to <16 x i8> ; <<16 x i8>> [#uses=1]
91 %tmp22 = shufflevector <16 x i8> %tmp21, <16 x i8> zeroinitializer, <16 x i32> < i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23 > ; <<16 x i8>> [#uses=1]
92 %tmp31 = bitcast <16 x i8> %tmp22 to <8 x i16> ; <<8 x i16>> [#uses=1]
93 %tmp.upgrd.4 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp31, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 > ; <<8 x i16>> [#uses=1]
94 %tmp36 = bitcast <8 x i16> %tmp.upgrd.4 to <4 x i32> ; <<4 x i32>> [#uses=1]
; test6: shuffle the vector loaded from %A against undef with mask
; <0, 5, 6, 7> and store it to %res. Only lane 0 comes from the loaded vector;
; lanes 1-3 select from the undef operand. The expected assembly contains no
; shuffle instruction at all, just a movaps load and a movaps store.
98 define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
101 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
102 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
103 ; CHECK-NEXT: movaps (%ecx), %xmm0
104 ; CHECK-NEXT: movaps %xmm0, (%eax)
106 %tmp1 = load <4 x float>* %A ; <<4 x float>> [#uses=1]
107 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
108 store <4 x float> %tmp2, <4 x float>* %res
; test7: splat lane 0 of an all-zero vector (shuffle mask zeroinitializer)
; and store the result through a null pointer. Every input is zero, so the
; expected assembly is just xorps to build the zero vector and a movaps store
; to absolute address 0.
112 define void @test7() nounwind {
113 ; CHECK-LABEL: test7:
115 ; CHECK-NEXT: xorps %xmm0, %xmm0
116 ; CHECK-NEXT: movaps %xmm0, 0
118 bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>>:1 [#uses=1]
119 shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer ; <<4 x float>>:2 [#uses=1]
120 store <4 x float> %2, <4 x float>* null
; External array of four i32 values; test8 reads its elements one at a time.
124 @x = external global [4 x i32]
; test8: load the four i32 elements of @x individually, assemble them into a
; <4 x i32> with insertelement, and reinterpret the result as <2 x i64>. The
; expected assembly merges the four scalar loads into one unaligned vector
; load (movups) through the global's non-lazy pointer.
126 define <2 x i64> @test8() nounwind {
127 ; CHECK-LABEL: test8:
129 ; CHECK-NEXT: movl L_x$non_lazy_ptr, %eax
130 ; CHECK-NEXT: movups (%eax), %xmm0
132 %tmp = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 0) ; <i32> [#uses=1]
133 %tmp3 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 1) ; <i32> [#uses=1]
134 %tmp5 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 2) ; <i32> [#uses=1]
135 %tmp7 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 3) ; <i32> [#uses=1]
136 %tmp.upgrd.1 = insertelement <4 x i32> undef, i32 %tmp, i32 0 ; <<4 x i32>> [#uses=1]
137 %tmp13 = insertelement <4 x i32> %tmp.upgrd.1, i32 %tmp3, i32 1 ; <<4 x i32>> [#uses=1]
138 %tmp14 = insertelement <4 x i32> %tmp13, i32 %tmp5, i32 2 ; <<4 x i32>> [#uses=1]
139 %tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3 ; <<4 x i32>> [#uses=1]
140 %tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64> ; <<2 x i64>> [#uses=1]
; test9: build a <4 x float> from the four scalar arguments %a..%d and return
; it. The expected assembly loads all four floats from the stack with a single
; unaligned movups.
; NOTE(review): the unused leading i32 %dummy presumably shifts the float
; arguments off 16-byte alignment, which would explain movups rather than
; movaps here -- confirm against the calling convention.
144 define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
145 ; CHECK-LABEL: test9:
147 ; CHECK-NEXT: movups {{[0-9]+}}(%esp), %xmm0
149 %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
150 %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
151 %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
152 %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
153 ret <4 x float> %tmp13
; test10: build a <4 x float> from the four scalar arguments %a..%d and
; return it. The expected assembly loads all four floats from the stack with a
; single aligned movaps.
156 define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
157 ; CHECK-LABEL: test10:
159 ; CHECK-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
161 %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
162 %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
163 %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
164 %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
165 ret <4 x float> %tmp13
; test11: build a <2 x double> from the two scalar arguments %a and %b and
; return it. The expected assembly loads both doubles from the stack with a
; single aligned movaps.
168 define <2 x double> @test11(double %a, double %b) nounwind {
169 ; CHECK-LABEL: test11:
171 ; CHECK-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
173 %tmp = insertelement <2 x double> undef, double %a, i32 0 ; <<2 x double>> [#uses=1]
174 %tmp7 = insertelement <2 x double> %tmp, double %b, i32 1 ; <<2 x double>> [#uses=1]
175 ret <2 x double> %tmp7
; test12: load a <4 x float> through a null pointer and derive two vectors
; from it:
;   %tmp2 = <v[0], v[1], 1.0, 1.0>  (mask <0, 1, 6, 7> against an all-ones
;                                    constant)
;   %tmp3 = <v[2], v[3], 0.0, 0.0>  (mask <2, 3, 6, 7> against zero)
; then add them and store the sum back through null. The expected assembly
; uses movhlps against a zeroed register for %tmp3 and shufps with a memory
; operand for %tmp2.
178 define void @test12() nounwind {
179 ; CHECK-LABEL: test12:
181 ; CHECK-NEXT: movaps 0, %xmm0
182 ; CHECK-NEXT: xorps %xmm1, %xmm1
183 ; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
184 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
185 ; CHECK-NEXT: addps %xmm1, %xmm0
186 ; CHECK-NEXT: movaps %xmm0, 0
188 %tmp1 = load <4 x float>* null ; <<4 x float>> [#uses=2]
189 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
190 %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
191 %tmp4 = fadd <4 x float> %tmp2, %tmp3 ; <<4 x float>> [#uses=1]
192 store <4 x float> %tmp4, <4 x float>* null
; test13: shuffle the vectors loaded from %B and %C with mask <1, 4, 1, 5>,
; giving <B[1], C[0], B[1], C[1]>, and store the result to %res. %A is not
; used by the visible body. The expected assembly needs two shuffles: shufps
; with a memory operand followed by pshufd to reorder the lanes.
196 define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
197 ; CHECK-LABEL: test13:
199 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
200 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
201 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
202 ; CHECK-NEXT: movaps (%edx), %xmm0
203 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
204 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
205 ; CHECK-NEXT: movdqa %xmm0, (%eax)
207 %tmp3 = load <4 x float>* %B ; <<4 x float>> [#uses=1]
208 %tmp5 = load <4 x float>* %C ; <<4 x float>> [#uses=1]
209 %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1]
210 store <4 x float> %tmp11, <4 x float>* %res
; test14: compute x + y and x - y on the vectors loaded from %x and %y, then
; return the low two lanes of the sum followed by the low two lanes of the
; difference (mask <0, 1, 4, 5>). The expected assembly combines the two
; halves with a single movlhps.
214 define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
215 ; CHECK-LABEL: test14:
217 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
218 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
219 ; CHECK-NEXT: movaps (%ecx), %xmm1
220 ; CHECK-NEXT: movaps (%eax), %xmm2
221 ; CHECK-NEXT: movaps %xmm2, %xmm0
222 ; CHECK-NEXT: addps %xmm1, %xmm0
223 ; CHECK-NEXT: subps %xmm1, %xmm2
224 ; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
226 %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=2]
227 %tmp5 = load <4 x float>* %x ; <<4 x float>> [#uses=2]
228 %tmp9 = fadd <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
229 %tmp21 = fsub <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
230 %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 > ; <<4 x float>> [#uses=1]
231 ret <4 x float> %tmp27
; test15: return the high two lanes of the vector loaded from %x followed by
; the high two lanes of the vector loaded from %y (mask <2, 3, 6, 7>). The
; expected assembly combines the two halves with a single movhlps.
234 define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
235 ; CHECK-LABEL: test15:
236 ; CHECK: ## BB#0: ## %entry
237 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
238 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
239 ; CHECK-NEXT: movaps (%ecx), %xmm0
240 ; CHECK-NEXT: movaps (%eax), %xmm1
241 ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
244 %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=1]
245 %tmp3 = load <4 x float>* %x ; <<4 x float>> [#uses=1]
246 %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
247 ret <4 x float> %tmp4
; test16: load the fourth <4 x double> from the array at %srcA (index 3, byte
; offset 96 in the expected assembly) and extract its elements 0 and 2 into a
; <2 x double> (mask <0, 2>). The expected assembly loads the low 128 bits
; with movapd and merges in element 2 with unpcklpd from memory. %dst is not
; used by the visible body.
252 define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) {
253 ; CHECK-LABEL: test16:
255 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
256 ; CHECK-NEXT: movapd 96(%eax), %xmm0
257 ; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
259 %i5 = getelementptr inbounds <4 x double>* %srcA, i32 3
260 %i6 = load <4 x double>* %i5, align 32
261 %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
; test17: shuffle the constant <undef, undef, 32768, 32768> against a vector
; that is entirely undef (%0), using mask <4, 5, 2, 3>. Lanes 0-1 come from
; the undef vector and lanes 2-3 keep the constant 32768. The result is
; reinterpreted as <4 x float> and stored through an undef pointer. The
; expected assembly loads the constant vector directly and stores it.
266 define fastcc void @test17() nounwind {
267 ; CHECK-LABEL: test17:
268 ; CHECK: ## BB#0: ## %entry
269 ; CHECK-NEXT: movaps {{.*#+}} xmm0 = <u,u,32768,32768>
270 ; CHECK-NEXT: movaps %xmm0, (%eax)
273 %0 = insertelement <4 x i32> undef, i32 undef, i32 1
274 %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
275 %2 = bitcast <4 x i32> %1 to <4 x float>
276 store <4 x float> %2, <4 x float> * undef
; f: truncate a <4 x double> (the unnamed first argument, %0) to <4 x float>
; and return it. The expected assembly converts each 128-bit half with
; cvtpd2ps and joins the two results with movlhps.
281 define <4 x float> @f(<4 x double>) nounwind {
283 ; CHECK: ## BB#0: ## %entry
284 ; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1
285 ; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0
286 ; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
289 %double2float.i = fptrunc <4 x double> %0 to <4 x float>
290 ret <4 x float> %double2float.i
; test_insert_64_zext: keep element 0 of %i and replace element 1 with the
; constant 0 (mask <0, 2> against <0, undef>). The expected assembly is a
; single register-to-register movq.
293 define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
294 ; CHECK-LABEL: test_insert_64_zext:
296 ; CHECK-NEXT: movq %xmm0, %xmm0
298 %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
; PR19721: reinterpret %i as an i128, clear its low 32 bits by AND-ing with
; -4294967296, and reinterpret the result as <4 x i32> again. The expected
; assembly below is the exact instruction sequence the test pins for this
; pattern: it moves individual elements through general-purpose registers and
; reassembles them with punpckldq and shufps.
302 define <4 x i32> @PR19721(<4 x i32> %i) {
303 ; CHECK-LABEL: PR19721:
305 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,0,0]
306 ; CHECK-NEXT: movd %xmm1, %eax
307 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,0,0,0]
308 ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
309 ; CHECK-NEXT: movd %xmm0, %ecx
310 ; CHECK-NEXT: movd %xmm1, %edx
311 ; CHECK-NEXT: movd %edx, %xmm0
312 ; CHECK-NEXT: movd %ecx, %xmm1
313 ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
314 ; CHECK-NEXT: movd %eax, %xmm0
315 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1]
317 %bc = bitcast <4 x i32> %i to i128
318 %insert = and i128 %bc, -4294967296
319 %bc2 = bitcast i128 %insert to <4 x i32>
; test_mul: element-wise multiply of two <4 x i32> vectors. The expected
; assembly uses two pmuludq instructions -- one on the inputs as given and one
; on copies rearranged with pshufd [1,0,3,0] -- and merges the two products
; with shufps followed by pshufd.
323 define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
324 ; CHECK-LABEL: test_mul:
326 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,0]
327 ; CHECK-NEXT: pmuludq %xmm1, %xmm0
328 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,0,3,0]
329 ; CHECK-NEXT: pmuludq %xmm2, %xmm1
330 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
331 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
333 %m = mul <4 x i32> %x, %y