1 ; Tests for SSE2 and below, without SSE3+.
2 ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s
; test1: replace the low element of a loaded <2 x double> with a scalar.
; The shuffle mask <2, 1> takes element 0 from %tmp7 (which holds %B in
; lane 0) and element 1 from the vector loaded from %A; the result is
; stored to %r.
; The expectations below want the vector loaded with movapd, its low half
; overwritten by a movlpd from 12(%esp) (presumably the stack slot of %B;
; confirm against the i386 calling convention), and an aligned store.
4 define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
5 %tmp3 = load <2 x double>* %A, align 16
6 %tmp7 = insertelement <2 x double> undef, double %B, i32 0
7 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
8 store <2 x double> %tmp9, <2 x double>* %r, align 16
12 ; CHECK: movl 4(%esp), %eax
13 ; CHECK-NEXT: movl 8(%esp), %ecx
14 ; CHECK-NEXT: movapd (%ecx), %xmm0
15 ; CHECK-NEXT: movlpd 12(%esp), %xmm0
16 ; CHECK-NEXT: movapd %xmm0, (%eax)
; test2: replace the high element of a loaded <2 x double> with a scalar.
; The shuffle mask <0, 2> keeps element 0 of the vector loaded from %A and
; takes element 1 from lane 0 of %tmp7 (which holds %B); the result is
; stored to %r.
; The expectations below want the high half overwritten by a movhpd from
; 12(%esp) (presumably the stack slot of %B; confirm).
; NOTE(review): the second expectation is not an adjacency check, unlike
; the corresponding line for test1; confirm whether that is intentional.
20 define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
21 %tmp3 = load <2 x double>* %A, align 16
22 %tmp7 = insertelement <2 x double> undef, double %B, i32 0
23 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
24 store <2 x double> %tmp9, <2 x double>* %r, align 16
28 ; CHECK: movl 4(%esp), %eax
29 ; CHECK: movl 8(%esp), %ecx
30 ; CHECK-NEXT: movapd (%ecx), %xmm0
31 ; CHECK-NEXT: movhpd 12(%esp), %xmm0
32 ; CHECK-NEXT: movapd %xmm0, (%eax)
; test3: build a <4 x float> by interleaving the two low elements of the
; vectors loaded from %A and %B, using scalar extract/insert pairs.
; The stored result is < A[0], B[0], A[1], B[1] >.
; No expected-output lines for this function are visible in this excerpt.
37 define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
38 %tmp = load <4 x float>* %B ; <<4 x float>> [#uses=2]
39 %tmp3 = load <4 x float>* %A ; <<4 x float>> [#uses=2]
40 %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0 ; <float> [#uses=1]
41 %tmp7 = extractelement <4 x float> %tmp, i32 0 ; <float> [#uses=1]
42 %tmp8 = extractelement <4 x float> %tmp3, i32 1 ; <float> [#uses=1]
43 %tmp9 = extractelement <4 x float> %tmp, i32 1 ; <float> [#uses=1]
44 %tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.1, i32 0 ; <<4 x float>> [#uses=1]
45 %tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1 ; <<4 x float>> [#uses=1]
46 %tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2 ; <<4 x float>> [#uses=1]
47 %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3 ; <<4 x float>> [#uses=1]
48 store <4 x float> %tmp13, <4 x float>* %res
; test4: shuffle %X against an undef vector with mask <2, 6, 3, 7>.
; Mask indices 6 and 7 select from the undef operand, so only result
; lanes 0 and 2 (taken from %X[2] and %X[3]) are defined.
; The expected pshufd immediate 50 (0x32) selects source lanes 2, 0, 3, 0;
; the lanes it picks for result positions 1 and 3 are don't-care.
54 define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
55 %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1]
56 store <4 x float> %tmp5, <4 x float>* %res
59 ; CHECK: pshufd $50, %xmm0, %xmm0
; test5: load a float through the pointer stored at %ptr, place it in lane 0
; of a <4 x float> whose other lanes are 0.0, then widen it with two
; interleaving shuffles:
;   1. as <16 x i8>: bytes 0..7 of the value land in the even positions and
;      zero bytes in the odd positions;
;   2. as <8 x i16>: zero i16 lanes land in the even positions and the low
;      four i16 lanes of step 1 in the odd positions.
; The final value is reinterpreted as <4 x i32>. The return instruction and
; any expected-output lines are not visible in this excerpt.
62 define <4 x i32> @test5(i8** %ptr) nounwind {
68 %tmp = load i8** %ptr ; <i8*> [#uses=1]
69 %tmp.upgrd.1 = bitcast i8* %tmp to float* ; <float*> [#uses=1]
70 %tmp.upgrd.2 = load float* %tmp.upgrd.1 ; <float> [#uses=1]
71 %tmp.upgrd.3 = insertelement <4 x float> undef, float %tmp.upgrd.2, i32 0 ; <<4 x float>> [#uses=1]
72 %tmp9 = insertelement <4 x float> %tmp.upgrd.3, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
73 %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
74 %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
75 %tmp21 = bitcast <4 x float> %tmp11 to <16 x i8> ; <<16 x i8>> [#uses=1]
76 %tmp22 = shufflevector <16 x i8> %tmp21, <16 x i8> zeroinitializer, <16 x i32> < i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23 > ; <<16 x i8>> [#uses=1]
77 %tmp31 = bitcast <16 x i8> %tmp22 to <8 x i16> ; <<8 x i16>> [#uses=1]
78 %tmp.upgrd.4 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp31, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 > ; <<8 x i16>> [#uses=1]
79 %tmp36 = bitcast <8 x i16> %tmp.upgrd.4 to <4 x i32> ; <<4 x i32>> [#uses=1]
; test6: shuffle a loaded <4 x float> against undef with mask <0, 5, 6, 7>.
; Only lane 0 comes from the loaded vector; lanes 1..3 select from the undef
; operand, so their contents are unconstrained.
; The expectations below want an aligned load into %xmm0 and an aligned
; store from it.
83 define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
84 %tmp1 = load <4 x float>* %A ; <<4 x float>> [#uses=1]
85 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
86 store <4 x float> %tmp2, <4 x float>* %res
90 ; CHECK: movaps (%ecx), %xmm0
91 ; CHECK: movaps %xmm0, (%eax)
; test7: splat lane 0 of an all-zero vector and store it to the null pointer.
; The two unnamed instructions define %1 (the bitcast zero vector) and %2
; (the shuffle with an all-zero mask); every input is zero, so the stored
; value is a zero vector.
; The expectations below want the register zeroed with xorps and then stored
; with movaps to absolute address 0.
94 define void @test7() nounwind {
95 bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>>:1 [#uses=1]
96 shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer ; <<4 x float>>:2 [#uses=1]
97 store <4 x float> %2, <4 x float>* null
100 ; CHECK-LABEL: test7:
101 ; CHECK: xorps %xmm0, %xmm0
102 ; CHECK: movaps %xmm0, 0
; @x: external array of four i32, declared here without an explicit alignment.
105 @x = external global [4 x i32]
; test8: load the four consecutive i32 elements of @x one at a time, assemble
; them into a <4 x i32> in index order, and reinterpret the result as
; <2 x i64>.
; The expectation below wants the four scalar loads merged into a single
; unaligned vector load (movups). The return instruction is not visible in
; this excerpt.
107 define <2 x i64> @test8() nounwind {
108 %tmp = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 0) ; <i32> [#uses=1]
109 %tmp3 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 1) ; <i32> [#uses=1]
110 %tmp5 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 2) ; <i32> [#uses=1]
111 %tmp7 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 3) ; <i32> [#uses=1]
112 %tmp.upgrd.1 = insertelement <4 x i32> undef, i32 %tmp, i32 0 ; <<4 x i32>> [#uses=1]
113 %tmp13 = insertelement <4 x i32> %tmp.upgrd.1, i32 %tmp3, i32 1 ; <<4 x i32>> [#uses=1]
114 %tmp14 = insertelement <4 x i32> %tmp13, i32 %tmp5, i32 2 ; <<4 x i32>> [#uses=1]
115 %tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3 ; <<4 x i32>> [#uses=1]
116 %tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64> ; <<2 x i64>> [#uses=1]
118 ; CHECK-LABEL: test8:
119 ; CHECK: movups (%eax), %xmm0
; test9: build a <4 x float> from four scalar float arguments that follow a
; leading, unused i32 %dummy argument.
; The expectation below wants the four arguments read with one unaligned
; vector load (movups) from 8(%esp).
; NOTE(review): presumably %dummy exists to shift the floats off a 16-byte
; boundary on the stack; confirm against the target ABI.
122 define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
123 %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
124 %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
125 %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
126 %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
127 ret <4 x float> %tmp13
128 ; CHECK-LABEL: test9:
129 ; CHECK: movups 8(%esp), %xmm0
; test10: build a <4 x float> from four scalar float arguments, in argument
; order, and return it.
; The expectation below wants the four arguments read with one aligned
; vector load (movaps) from 4(%esp).
; NOTE(review): this relies on 4(%esp) being 16-byte aligned at function
; entry on this target; confirm against the Darwin i386 ABI.
132 define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
133 %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
134 %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
135 %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
136 %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
137 ret <4 x float> %tmp13
138 ; CHECK-LABEL: test10:
139 ; CHECK: movaps 4(%esp), %xmm0
; test11: build a <2 x double> from two scalar double arguments (%a in
; lane 0, %b in lane 1) and return it.
; The expectation below wants both arguments read with one aligned vector
; load (movaps) from 4(%esp).
142 define <2 x double> @test11(double %a, double %b) nounwind {
143 %tmp = insertelement <2 x double> undef, double %a, i32 0 ; <<2 x double>> [#uses=1]
144 %tmp7 = insertelement <2 x double> %tmp, double %b, i32 1 ; <<2 x double>> [#uses=1]
145 ret <2 x double> %tmp7
146 ; CHECK-LABEL: test11:
147 ; CHECK: movaps 4(%esp), %xmm0
; test12: combine two shuffles of the same loaded vector t (from null):
;   %tmp2 = < t[0], t[1], 1.0, 1.0 >   (mask <0, 1, 6, 7> against a 1.0 splat)
;   %tmp3 = < t[2], t[3], 0.0, 0.0 >   (mask <2, 3, 6, 7> against zero)
; Their element-wise sum is stored back to null.
; Only the function label expectation is visible in this excerpt.
150 define void @test12() nounwind {
151 %tmp1 = load <4 x float>* null ; <<4 x float>> [#uses=2]
152 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
153 %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
154 %tmp4 = fadd <4 x float> %tmp2, %tmp3 ; <<4 x float>> [#uses=1]
155 store <4 x float> %tmp4, <4 x float>* null
157 ; CHECK-LABEL: test12:
; test13: two-input shuffle with mask <1, 4, 1, 5>, producing
; < B[1], C[0], B[1], C[1] > from the vectors loaded from %B and %C.
; %A is not referenced in the visible body.
; The expectations below want a two-instruction sequence:
;   shufps $69 (0x45): lanes 0,1 = elements 1,1 of %xmm0,
;                      lanes 2,3 = elements 0,1 of the memory operand;
;   pshufd $-40 (0xD8): reorders the lanes as 0, 2, 1, 3.
162 define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
163 %tmp3 = load <4 x float>* %B ; <<4 x float>> [#uses=1]
164 %tmp5 = load <4 x float>* %C ; <<4 x float>> [#uses=1]
165 %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1]
166 store <4 x float> %tmp11, <4 x float>* %res
169 ; CHECK: shufps $69, (%ecx), %xmm0
170 ; CHECK: pshufd $-40, %xmm0, %xmm0
; test14: compute x + y and x - y for the vectors loaded from %x and %y, then
; return the low two lanes of the sum followed by the low two lanes of the
; difference (mask <0, 1, 4, 5>).
; The expectations below capture register names as FileCheck variables: the
; addps and subps must share the same source register X1, and the movlhps
; must merge the subps result (X2) into the addps result (X0).
173 define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
174 %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=2]
175 %tmp5 = load <4 x float>* %x ; <<4 x float>> [#uses=2]
176 %tmp9 = fadd <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
177 %tmp21 = fsub <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
178 %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 > ; <<4 x float>> [#uses=1]
179 ret <4 x float> %tmp27
180 ; CHECK-LABEL: test14:
181 ; CHECK: addps [[X1:%xmm[0-9]+]], [[X0:%xmm[0-9]+]]
182 ; CHECK: subps [[X1]], [[X2:%xmm[0-9]+]]
183 ; CHECK: movlhps [[X2]], [[X0]]
; test15: return the high halves of two loaded vectors, packed together:
; mask <2, 3, 6, 7> yields < x[2], x[3], y[2], y[3] >.
; The expectation below wants this done with a single movhlps between
; %xmm1 and %xmm0.
186 define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
188 %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=1]
189 %tmp3 = load <4 x float>* %x ; <<4 x float>> [#uses=1]
190 %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
191 ret <4 x float> %tmp4
192 ; CHECK-LABEL: test15:
193 ; CHECK: movhlps %xmm1, %xmm0
; test16: load the <4 x double> at index 3 from %srcA (32-byte aligned load)
; and extract its even elements 0 and 2 into a <2 x double>.
; %dst is not referenced in the visible body; the return instruction and any
; further expectations are not visible in this excerpt. Note that the label
; expectation for this function precedes its definition.
197 ; CHECK-LABEL: test16:
201 define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) {
202 %i5 = getelementptr inbounds <4 x double>* %srcA, i32 3
203 %i6 = load <4 x double>* %i5, align 32
204 %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
; test17: shuffle a partially-undef constant against an undef-derived vector.
; Mask <4, 5, 2, 3> takes result lanes 0 and 1 from %0 (all undef) and lanes
; 2 and 3 from the constant (both 32768). The result is reinterpreted as
; <4 x float> and stored through an undef pointer.
; No expected-output lines for this function are visible in this excerpt.
; NOTE(review): presumably a compile-without-crashing regression test; confirm.
209 define fastcc void @test17() nounwind {
211 %0 = insertelement <4 x i32> undef, i32 undef, i32 1
212 %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
213 %2 = bitcast <4 x i32> %1 to <4 x float>
214 store <4 x float> %2, <4 x float> * undef
; f: truncate the unnamed <4 x double> argument (%0) to <4 x float> with
; fptrunc and return it.
; No expected-output lines for this function are visible in this excerpt.
219 define <4 x float> @f(<4 x double>) nounwind {
221 %double2float.i = fptrunc <4 x double> %0 to <4 x float>
222 ret <4 x float> %double2float.i
; test_insert_64_zext: keep element 0 of %i and replace element 1 with zero.
; Mask <0, 2> selects %i[0] and element 0 of the constant <0, undef>, so
; the result is < %i[0], 0 >.
; Only the label expectation is visible in this excerpt; unlike most labels
; in this file, its pattern has no trailing colon.
225 define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
226 ; CHECK-LABEL: test_insert_64_zext
229 %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
; PR19721: reinterpret %i as an i128, clear its low 32 bits by and-ing with
; -4294967296 (all bits set except the low 32), and reinterpret the result
; as <4 x i32> again.
; NOTE(review): on this little-endian x86 target the low 32 bits should be
; vector element 0; confirm.
; Only the label expectation is visible in this excerpt; its pattern has no
; trailing colon.
233 define <4 x i32> @PR19721(<4 x i32> %i) {
234 %bc = bitcast <4 x i32> %i to i128
235 %insert = and i128 %bc, -4294967296
236 %bc2 = bitcast i128 %insert to <4 x i32>
239 ; CHECK-LABEL: PR19721