1 ; Tests for SSE2 and below, without SSE3+.
2 ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s
; test1: replace the low element of a loaded <2 x double> with a scalar.
; Loads the vector at %A (16-byte aligned), places %B in lane 0 of a
; temporary, then shuffles with mask <2, 1>: result lane 0 is lane 0 of the
; temporary (%B) and result lane 1 is lane 1 of the loaded vector.
; The result is stored to %r (16-byte aligned).
4 define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
5 %tmp3 = load <2 x double>* %A, align 16
6 %tmp7 = insertelement <2 x double> undef, double %B, i32 0
7 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
8 store <2 x double> %tmp9, <2 x double>* %r, align 16
12 ; CHECK: movl 8(%esp), %eax
13 ; CHECK-NEXT: movapd (%eax), %xmm0
14 ; CHECK-NEXT: movlpd 12(%esp), %xmm0
15 ; CHECK-NEXT: movl 4(%esp), %eax
16 ; CHECK-NEXT: movapd %xmm0, (%eax)
; test2: replace the high element of a loaded <2 x double> with a scalar.
; Loads the vector at %A (16-byte aligned), places %B in lane 0 of a
; temporary, then shuffles with mask <0, 2>: result lane 0 is lane 0 of the
; loaded vector and result lane 1 is lane 0 of the temporary (%B).
; The result is stored to %r (16-byte aligned).
20 define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
21 %tmp3 = load <2 x double>* %A, align 16
22 %tmp7 = insertelement <2 x double> undef, double %B, i32 0
23 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
24 store <2 x double> %tmp9, <2 x double>* %r, align 16
28 ; CHECK: movl 8(%esp), %eax
29 ; CHECK-NEXT: movapd (%eax), %xmm0
30 ; CHECK-NEXT: movhpd 12(%esp), %xmm0
31 ; CHECK-NEXT: movl 4(%esp), %eax
32 ; CHECK-NEXT: movapd %xmm0, (%eax)
; test3: interleave the low two elements of two <4 x float> vectors using
; scalar extract/insert operations.
; With A = load %A and B = load %B, the stored vector is
; < A[0], B[0], A[1], B[1] >, written to %res.
; Neither the loads nor the store carry an explicit alignment.
37 define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
38 %tmp = load <4 x float>* %B ; <<4 x float>> [#uses=2]
39 %tmp3 = load <4 x float>* %A ; <<4 x float>> [#uses=2]
40 %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0 ; <float> [#uses=1]
41 %tmp7 = extractelement <4 x float> %tmp, i32 0 ; <float> [#uses=1]
42 %tmp8 = extractelement <4 x float> %tmp3, i32 1 ; <float> [#uses=1]
43 %tmp9 = extractelement <4 x float> %tmp, i32 1 ; <float> [#uses=1]
44 %tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.1, i32 0 ; <<4 x float>> [#uses=1]
45 %tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1 ; <<4 x float>> [#uses=1]
46 %tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2 ; <<4 x float>> [#uses=1]
47 %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3 ; <<4 x float>> [#uses=1]
48 store <4 x float> %tmp13, <4 x float>* %res
; test4: single-input shuffle of %X with an undef second operand.
; Mask <2, 6, 3, 7>: result lanes 0 and 2 are %X[2] and %X[3]; mask indices
; 6 and 7 select from the undef operand, so result lanes 1 and 3 are
; unconstrained. The result is stored to %res.
54 define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
55 %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1]
56 store <4 x float> %tmp5, <4 x float>* %res
59 ; CHECK: pshufd $50, %xmm0, %xmm0
; test5: load one float through a double indirection and widen its bytes by
; interleaving with zeros twice.
; Steps visible here:
;   1. Load an i8* from %ptr, reinterpret it as float*, and load a float.
;   2. Build < f, 0.0, 0.0, 0.0 > and view it as <16 x i8>.
;   3. Interleave its low 8 bytes with zero bytes (mask 0,16,1,17,...,7,23)
;      and view the result as <8 x i16>.
;   4. Interleave zero i16 elements with the low four i16 elements of that
;      value (mask 0,8,1,9,2,10,3,11) and view the result as <4 x i32>.
62 define <4 x i32> @test5(i8** %ptr) nounwind {
68 %tmp = load i8** %ptr ; <i8*> [#uses=1]
69 %tmp.upgrd.1 = bitcast i8* %tmp to float* ; <float*> [#uses=1]
70 %tmp.upgrd.2 = load float* %tmp.upgrd.1 ; <float> [#uses=1]
71 %tmp.upgrd.3 = insertelement <4 x float> undef, float %tmp.upgrd.2, i32 0 ; <<4 x float>> [#uses=1]
72 %tmp9 = insertelement <4 x float> %tmp.upgrd.3, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
73 %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
74 %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
75 %tmp21 = bitcast <4 x float> %tmp11 to <16 x i8> ; <<16 x i8>> [#uses=1]
76 %tmp22 = shufflevector <16 x i8> %tmp21, <16 x i8> zeroinitializer, <16 x i32> < i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23 > ; <<16 x i8>> [#uses=1]
77 %tmp31 = bitcast <16 x i8> %tmp22 to <8 x i16> ; <<8 x i16>> [#uses=1]
78 %tmp.upgrd.4 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp31, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 > ; <<8 x i16>> [#uses=1]
79 %tmp36 = bitcast <8 x i16> %tmp.upgrd.4 to <4 x i32> ; <<4 x i32>> [#uses=1]
; test6: shuffle whose only defined lane is an identity copy.
; Loads a <4 x float> from %A and shuffles it with undef using mask
; <0, 5, 6, 7>: lane 0 comes from the loaded vector, while indices 5-7
; select from the undef operand. The result is stored to %res.
83 define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
84 %tmp1 = load <4 x float>* %A ; <<4 x float>> [#uses=1]
85 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
86 store <4 x float> %tmp2, <4 x float>* %res
90 ; CHECK: movaps (%eax), %xmm0
91 ; CHECK: movaps %xmm0, (%eax)
; test7: splat of an all-zero vector, using unnamed (numbered) values.
; %1 is a zero <4 x i32> reinterpreted as <4 x float>; %2 broadcasts lane 0
; of %1 (mask zeroinitializer = <0, 0, 0, 0>). %2 is stored to the null
; pointer, i.e. address 0.
94 define void @test7() nounwind {
95 bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>>:1 [#uses=1]
96 shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer ; <<4 x float>>:2 [#uses=1]
97 store <4 x float> %2, <4 x float>* null
101 ; CHECK: pxor %xmm0, %xmm0
102 ; CHECK: movaps %xmm0, 0
105 @x = external global [4 x i32]
; test8: build a vector from four consecutive scalar loads.
; Loads elements 0..3 of the external global @x ([4 x i32]) individually,
; inserts them into lanes 0..3 of a <4 x i32>, and reinterprets the result
; as <2 x i64>.
107 define <2 x i64> @test8() nounwind {
108 %tmp = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 0) ; <i32> [#uses=1]
109 %tmp3 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 1) ; <i32> [#uses=1]
110 %tmp5 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 2) ; <i32> [#uses=1]
111 %tmp7 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 3) ; <i32> [#uses=1]
112 %tmp.upgrd.1 = insertelement <4 x i32> undef, i32 %tmp, i32 0 ; <<4 x i32>> [#uses=1]
113 %tmp13 = insertelement <4 x i32> %tmp.upgrd.1, i32 %tmp3, i32 1 ; <<4 x i32>> [#uses=1]
114 %tmp14 = insertelement <4 x i32> %tmp13, i32 %tmp5, i32 2 ; <<4 x i32>> [#uses=1]
115 %tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3 ; <<4 x i32>> [#uses=1]
116 %tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64> ; <<2 x i64>> [#uses=1]
119 ; CHECK: movups (%eax), %xmm0
; test9: build < %a, %b, %c, %d > from four scalar float arguments and
; return it. The leading i32 %dummy argument is not used by the body; it
; precedes the floats in the argument list.
122 define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
123 %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
124 %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
125 %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
126 %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
127 ret <4 x float> %tmp13
129 ; CHECK: movups 8(%esp), %xmm0
; test10: build < %a, %b, %c, %d > from four scalar float arguments and
; return it. Same body as test9, but without a leading i32 argument.
132 define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
133 %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
134 %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
135 %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
136 %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
137 ret <4 x float> %tmp13
139 ; CHECK: movaps 4(%esp), %xmm0
; test11: build < %a, %b > from two scalar double arguments and return it.
142 define <2 x double> @test11(double %a, double %b) nounwind {
143 %tmp = insertelement <2 x double> undef, double %a, i32 0 ; <<2 x double>> [#uses=1]
144 %tmp7 = insertelement <2 x double> %tmp, double %b, i32 1 ; <<2 x double>> [#uses=1]
145 ret <2 x double> %tmp7
147 ; CHECK: movapd 4(%esp), %xmm0
; test12: two shuffles of one loaded vector against constant vectors,
; followed by an add.
; With V = load from the null pointer:
;   %tmp2 = < V[0], V[1], 1.0, 1.0 >   (mask <0, 1, 6, 7> against a 1.0 splat)
;   %tmp3 = < V[2], V[3], 0.0, 0.0 >   (mask <2, 3, 6, 7> against zero)
; The element-wise sum %tmp2 + %tmp3 is stored back to the null pointer.
150 define void @test12() nounwind {
151 %tmp1 = load <4 x float>* null ; <<4 x float>> [#uses=2]
152 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
153 %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
154 %tmp4 = fadd <4 x float> %tmp2, %tmp3 ; <<4 x float>> [#uses=1]
155 store <4 x float> %tmp4, <4 x float>* null
; test13: two-input shuffle with a repeated element.
; With B = load %B and C = load %C, mask <1, 4, 1, 5> produces
; < B[1], C[0], B[1], C[1] >, which is stored to %res.
; The %A argument is not referenced by the instructions shown here.
162 define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
163 %tmp3 = load <4 x float>* %B ; <<4 x float>> [#uses=1]
164 %tmp5 = load <4 x float>* %C ; <<4 x float>> [#uses=1]
165 %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1]
166 store <4 x float> %tmp11, <4 x float>* %res
169 ; CHECK: shufps $69, (%eax), %xmm0
170 ; CHECK: pshufd $-40, %xmm0, %xmm0
; test14: combine the low halves of a sum and a difference.
; With X = load %x and Y = load %y, computes S = X + Y and D = X - Y, then
; returns < S[0], S[1], D[0], D[1] > (mask <0, 1, 4, 5>).
173 define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
174 %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=2]
175 %tmp5 = load <4 x float>* %x ; <<4 x float>> [#uses=2]
176 %tmp9 = fadd <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
177 %tmp21 = fsub <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
178 %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 > ; <<4 x float>> [#uses=1]
179 ret <4 x float> %tmp27
181 ; CHECK: addps %xmm1, %xmm0
182 ; CHECK: subps %xmm1, %xmm2
183 ; CHECK: movlhps %xmm2, %xmm0
; test15: combine the high halves of two loaded vectors.
; With X = load %x and Y = load %y, mask <2, 3, 6, 7> returns
; < X[2], X[3], Y[2], Y[3] >.
186 define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
188 %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=1]
189 %tmp3 = load <4 x float>* %x ; <<4 x float>> [#uses=1]
190 %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
191 ret <4 x float> %tmp4
193 ; CHECK: movhlps %xmm1, %xmm0
; test16: narrowing shuffle from a 256-bit vector to a 128-bit vector.
; Loads the <4 x double> at index 3 from %srcA (32-byte aligned) and picks
; elements 0 and 2 (mask <0, 2>) into a <2 x double>.
; The %dst argument is not referenced by the instructions shown here.
201 define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) {
202 %i5 = getelementptr inbounds <4 x double>* %srcA, i32 3
203 %i6 = load <4 x double>* %i5, align 32
204 %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
; test17: shuffle mixing a partially-undef constant with an undef-built
; vector, using unnamed (numbered) values.
; %0 inserts an undef scalar into lane 1 of an undef vector. %1 uses mask
; <4, 5, 2, 3>: lanes 0-1 come from %0, lanes 2-3 are the constant 32768
; from the first operand. %2 reinterprets %1 as <4 x float>, which is then
; stored through an undef pointer.
209 define fastcc void @test17() nounwind {
211 %0 = insertelement <4 x i32> undef, i32 undef, i32 1
212 %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
213 %2 = bitcast <4 x i32> %1 to <4 x float>
214 store <4 x float> %2, <4 x float> * undef
; f: truncate a <4 x double> to <4 x float> and return it.
; The single parameter is unnamed, so it is referenced as %0.
219 define <4 x float> @f(<4 x double>) nounwind {
221 %double2float.i = fptrunc <4 x double> %0 to <4 x float>
222 ret <4 x float> %double2float.i