1 ; RUN: opt < %s -instcombine -S | FileCheck %s
2 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; test1: puts %f in lane 0 of a <4 x float> (lanes 1-3 set to 0.0), passes it
; through the sub.ss, mul.ss, min.ss and max.ss intrinsics, feeds the result to
; cvttss2si and truncates the i32 to i16. The directives below require that the
; optimized output has no insertelement of a 0.00 constant and no call to the
; sse.mul / sse.sub intrinsics.
4 define i16 @test1(float %f) {
8 ; CHECK-NOT: insertelement {{.*}} 0.00
9 ; CHECK-NOT: call {{.*}} @llvm.x86.sse.mul
10 ; CHECK-NOT: call {{.*}} @llvm.x86.sse.sub
12 %tmp = insertelement <4 x float> undef, float %f, i32 0 ; <<4 x float>> [#uses=1]
13 %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
14 %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
15 %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
16 %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
17 %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
18 %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
19 %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer ) ; <<4 x float>> [#uses=1]
20 %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; <i32> [#uses=1]
21 %tmp69 = trunc i32 %tmp.upgrd.1 to i16 ; <i16> [#uses=1]
; test2: inserts %f * %f into lane 0 of a <4 x float> (lanes 1-3 set to 0.0),
; bitcasts the vector to <4 x i32> and extracts lane 0 again. The expected
; output contains neither insertelement nor extractelement.
25 define i32 @test2(float %f) {
26 ; CHECK-LABEL: @test2(
27 ; CHECK-NOT: insertelement
28 ; CHECK-NOT: extractelement
30 %tmp5 = fmul float %f, %f
31 %tmp9 = insertelement <4 x float> undef, float %tmp5, i32 0
32 %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
33 %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
34 %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
35 %tmp19 = bitcast <4 x float> %tmp12 to <4 x i32>
36 %tmp21 = extractelement <4 x i32> %tmp19, i32 0
; test3: calls eight SSE/SSE2 float-to-int conversion intrinsics (cvtss2si,
; cvtss2si64, cvttss2si, cvttss2si64, cvtsd2si, cvtsd2si64, cvttsd2si,
; cvttsd2si64). Each call gets its own input vector with %f (or %d) in lane 0
; and 0.0 in the remaining lanes. The i32 results are summed and sign-extended,
; the i64 results are summed, and both sums are added into %tmp15. The
; directive requires that no insertelement of a 0.00 constant survives.
40 define i64 @test3(float %f, double %d) {
41 ; CHECK-LABEL: @test3(
42 ; CHECK-NOT: insertelement {{.*}} 0.00
45 %v00 = insertelement <4 x float> undef, float %f, i32 0
46 %v01 = insertelement <4 x float> %v00, float 0.000000e+00, i32 1
47 %v02 = insertelement <4 x float> %v01, float 0.000000e+00, i32 2
48 %v03 = insertelement <4 x float> %v02, float 0.000000e+00, i32 3
49 %tmp0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %v03)
50 %v10 = insertelement <4 x float> undef, float %f, i32 0
51 %v11 = insertelement <4 x float> %v10, float 0.000000e+00, i32 1
52 %v12 = insertelement <4 x float> %v11, float 0.000000e+00, i32 2
53 %v13 = insertelement <4 x float> %v12, float 0.000000e+00, i32 3
54 %tmp1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %v13)
55 %v20 = insertelement <4 x float> undef, float %f, i32 0
56 %v21 = insertelement <4 x float> %v20, float 0.000000e+00, i32 1
57 %v22 = insertelement <4 x float> %v21, float 0.000000e+00, i32 2
58 %v23 = insertelement <4 x float> %v22, float 0.000000e+00, i32 3
59 %tmp2 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %v23)
60 %v30 = insertelement <4 x float> undef, float %f, i32 0
61 %v31 = insertelement <4 x float> %v30, float 0.000000e+00, i32 1
62 %v32 = insertelement <4 x float> %v31, float 0.000000e+00, i32 2
63 %v33 = insertelement <4 x float> %v32, float 0.000000e+00, i32 3
64 %tmp3 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %v33)
65 %v40 = insertelement <2 x double> undef, double %d, i32 0
66 %v41 = insertelement <2 x double> %v40, double 0.000000e+00, i32 1
67 %tmp4 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %v41)
68 %v50 = insertelement <2 x double> undef, double %d, i32 0
69 %v51 = insertelement <2 x double> %v50, double 0.000000e+00, i32 1
70 %tmp5 = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %v51)
71 %v60 = insertelement <2 x double> undef, double %d, i32 0
72 %v61 = insertelement <2 x double> %v60, double 0.000000e+00, i32 1
73 %tmp6 = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %v61)
74 %v70 = insertelement <2 x double> undef, double %d, i32 0
75 %v71 = insertelement <2 x double> %v70, double 0.000000e+00, i32 1
76 %tmp7 = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %v71)
; Combine every conversion result so that none of the calls is dead.
77 %tmp8 = add i32 %tmp0, %tmp2
78 %tmp9 = add i32 %tmp4, %tmp6
79 %tmp10 = add i32 %tmp8, %tmp9
80 %tmp11 = sext i32 %tmp10 to i64
81 %tmp12 = add i64 %tmp1, %tmp3
82 %tmp13 = add i64 %tmp5, %tmp7
83 %tmp14 = add i64 %tmp12, %tmp13
84 %tmp15 = add i64 %tmp11, %tmp14
; get_image: inserts the truncated @fgetc result into lane 1 of an all-zero
; <100 x i8> vector, then extracts lane 0 (which the insert did not touch),
; compares it with 80 and branches on the result. The expected output contains
; no extractelement.
88 define void @get_image() nounwind {
89 ; CHECK-LABEL: @get_image(
90 ; CHECK-NOT: extractelement
93 %0 = call i32 @fgetc(i8* null) nounwind ; <i32> [#uses=1]
94 %1 = trunc i32 %0 to i8 ; <i8> [#uses=1]
95 %tmp2 = insertelement <100 x i8> zeroinitializer, i8 %1, i32 1 ; <<100 x i8>> [#uses=1]
96 %tmp1 = extractelement <100 x i8> %tmp2, i32 0 ; <i8> [#uses=1]
97 %2 = icmp eq i8 %tmp1, 80 ; <i1> [#uses=1]
98 br i1 %2, label %bb2, label %bb3
100 bb2: ; preds = %entry
103 bb3: ; preds = %bb2, %entry
; vac: loads a <4 x float> from %a, overwrites lanes 0 through 3 with 0.0 using
; one insertelement per lane, and stores the vector back to %a. Since every
; lane is replaced, none of the loaded values reaches the store.
; NOTE(review): no FileCheck directives for this function are visible in this
; view of the file - confirm against the full source.
108 define void @vac(<4 x float>* nocapture %a) nounwind {
113 %tmp1 = load <4 x float>* %a ; <<4 x float>> [#uses=1]
114 %vecins = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 0 ; <<4 x float>> [#uses=1]
115 %vecins4 = insertelement <4 x float> %vecins, float 0.000000e+00, i32 1; <<4 x float>> [#uses=1]
116 %vecins6 = insertelement <4 x float> %vecins4, float 0.000000e+00, i32 2; <<4 x float>> [#uses=1]
117 %vecins8 = insertelement <4 x float> %vecins6, float 0.000000e+00, i32 3; <<4 x float>> [#uses=1]
118 store <4 x float> %vecins8, <4 x float>* %a
; External declarations used by the tests above: @fgetc (called by get_image),
; the SSE scalar sub/mul/min/max intrinsics (test1), and the SSE/SSE2
; float-to-int conversion intrinsics (test1 and test3).
122 declare i32 @fgetc(i8*)
124 declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)
126 declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>)
128 declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)
130 declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)
132 declare i32 @llvm.x86.sse.cvtss2si(<4 x float>)
133 declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>)
134 declare i32 @llvm.x86.sse.cvttss2si(<4 x float>)
135 declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>)
136 declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>)
137 declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>)
138 declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>)
139 declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>)
141 ; <rdar://problem/6945110>
; kernel3_vertical: %tmp2 widens the loaded <4 x i16> to eight lanes (upper four
; undef); %tmp3 then takes its lanes 0-3 from %tmp2 (mask indices 8-11) and its
; lanes 4-7 from the loaded <8 x i16> %tmp1, and is passed to pmovzxwd. The
; expected output keeps the %tmp2 shuffle, has no further shufflevector, and
; has the pmovzxwd call on the line right after %tmp2.
142 define <4 x i32> @kernel3_vertical(<4 x i16> * %src, <8 x i16> * %foo) nounwind {
144 %tmp = load <4 x i16>* %src
145 %tmp1 = load <8 x i16>* %foo
146 ; CHECK: %tmp2 = shufflevector
147 %tmp2 = shufflevector <4 x i16> %tmp, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
148 ; pmovzxwd ignores the upper 64-bits of its input; -instcombine should remove this shuffle:
149 ; CHECK-NOT: shufflevector
150 %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
151 ; CHECK-NEXT: pmovzxwd
152 %0 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp3)
; Intrinsic called by kernel3_vertical: takes <8 x i16>, returns <4 x i32>.
155 declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
; dead_shuffle_elt: %shuffle.i repeats the two lanes of %y twice, but the second
; shuffle only reads its lanes 0 and 1 (mask indices 4 and 5) and takes lanes 2
; and 3 from %x. The expected output rewrites the first shuffle so that its
; second operand and its two unused upper mask entries become undef.
157 define <4 x float> @dead_shuffle_elt(<4 x float> %x, <2 x float> %y) nounwind {
159 ; CHECK-LABEL: define <4 x float> @dead_shuffle_elt(
160 ; CHECK: shufflevector <2 x float> %y, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
161 %shuffle.i = shufflevector <2 x float> %y, <2 x float> %y, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
162 %shuffle9.i = shufflevector <4 x float> %x, <4 x float> %shuffle.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
163 ret <4 x float> %shuffle9.i
; test_fptrunc: builds <%f, 0.0, 0.0, 0.0> as <4 x double>, truncates it to
; <4 x float> and keeps only lanes 0 and 1 via the final shuffle. The
; directives expect two insertelement instructions in the output and none
; after the second one.
166 define <2 x float> @test_fptrunc(double %f) {
167 ; CHECK-LABEL: @test_fptrunc(
168 ; CHECK: insertelement
169 ; CHECK: insertelement
170 ; CHECK-NOT: insertelement
171 %tmp9 = insertelement <4 x double> undef, double %f, i32 0
172 %tmp10 = insertelement <4 x double> %tmp9, double 0.000000e+00, i32 1
173 %tmp11 = insertelement <4 x double> %tmp10, double 0.000000e+00, i32 2
174 %tmp12 = insertelement <4 x double> %tmp11, double 0.000000e+00, i32 3
175 %tmp5 = fptrunc <4 x double> %tmp12 to <4 x float>
176 %ret = shufflevector <4 x float> %tmp5, <4 x float> undef, <2 x i32> <i32 0, i32 1>
; test_fpext: builds <%f, 0.0, 0.0, 0.0> as <4 x float>, extends it to
; <4 x double> and returns only lanes 0 and 1 via the final shuffle. The
; directives expect two insertelement instructions in the output and none
; after the second one.
180 define <2 x double> @test_fpext(float %f) {
181 ; CHECK-LABEL: @test_fpext(
182 ; CHECK: insertelement
183 ; CHECK: insertelement
184 ; CHECK-NOT: insertelement
185 %tmp9 = insertelement <4 x float> undef, float %f, i32 0
186 %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
187 %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
188 %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
189 %tmp5 = fpext <4 x float> %tmp12 to <4 x double>
190 %ret = shufflevector <4 x double> %tmp5, <4 x double> undef, <2 x i32> <i32 0, i32 1>
191 ret <2 x double> %ret
; test_select: %a3 and %b3 are each built with four insertelements. The constant
; select mask <true, false, false, true> takes lanes 0 and 3 from %a3 and lanes
; 1 and 2 from %b3. The expected output keeps only the inserts into lanes 0 and
; 3 of the %a chain and replaces %b3 with a constant vector whose lanes 0 and 3
; are undef.
194 define <4 x float> @test_select(float %f, float %g) {
195 ; CHECK-LABEL: @test_select(
196 ; CHECK: %a0 = insertelement <4 x float> undef, float %f, i32 0
197 ; CHECK-NOT: insertelement
198 ; CHECK: %a3 = insertelement <4 x float> %a0, float 3.000000e+00, i32 3
199 ; CHECK-NOT: insertelement
200 ; CHECK: %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> <float undef, float 4.000000e+00, float 5.000000e+00, float undef>
201 %a0 = insertelement <4 x float> undef, float %f, i32 0
202 %a1 = insertelement <4 x float> %a0, float 1.000000e+00, i32 1
203 %a2 = insertelement <4 x float> %a1, float 2.000000e+00, i32 2
204 %a3 = insertelement <4 x float> %a2, float 3.000000e+00, i32 3
205 %b0 = insertelement <4 x float> undef, float %g, i32 0
206 %b1 = insertelement <4 x float> %b0, float 4.000000e+00, i32 1
207 %b2 = insertelement <4 x float> %b1, float 5.000000e+00, i32 2
208 %b3 = insertelement <4 x float> %b2, float 6.000000e+00, i32 3
209 %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> %b3
213 ; We should optimize these two redundant insertqi into one
; Both calls below insert from the same %i with identical (i8 32, i8 32)
; operands, and the second call takes the first call's result as its first
; argument. The expected output has exactly one insertqi call.
214 ; CHECK: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i)
215 define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
216 ; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
217 ; CHECK-NOT: insertqi
218 %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
219 %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
223 ; The result of this insert is the second arg, since the top 64 bits of
224 ; the result are undefined, and we copy the bottom 64 bits from the
; The call below uses the operands (i8 64, i8 0); the expected output returns
; %i directly.
226 ; CHECK: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i)
227 define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
228 ; CHECK: ret <2 x i64> %i
229 %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
233 ; Test the several types of ranges and ordering that exist for two insertqi
; Contained range: insert with (i8 32, i8 0), then with (i8 16, i8 16), both
; from %i. The expected output is a single call with (i8 32, i8 0).
; NOTE(review): reading the two i8 operands as (bit length, bit index), the
; second range [16, 32) lies inside the first [0, 32) - confirm the operand
; order against the INSERTQ documentation.
234 ; CHECK: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i)
235 define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
236 ; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
237 ; CHECK: ret <2 x i64> %[[RES]]
238 %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
239 %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
; Contained range, opposite order: insert with (i8 16, i8 16) first, then with
; (i8 32, i8 0), both from %i. The expected output is again a single call with
; (i8 32, i8 0).
243 ; CHECK: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i)
244 define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
245 ; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
246 ; CHECK: ret <2 x i64> %[[RES]]
247 %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
248 %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
; Overlapping ranges: insert with (i8 32, i8 0), then with (i8 32, i8 16), both
; from %i. The expected output is a single call with (i8 48, i8 0).
252 ; CHECK: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i)
253 define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
254 ; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
255 ; CHECK: ret <2 x i64> %[[RES]]
256 %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
257 %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
; Overlapping ranges, opposite order: insert with (i8 32, i8 16) first, then
; with (i8 32, i8 0), both from %i. The expected output is again a single call
; with (i8 48, i8 0).
261 ; CHECK: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i)
262 define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
263 ; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
264 ; CHECK: ret <2 x i64> %[[RES]]
265 %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
266 %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
; Adjacent ranges: insert with (i8 32, i8 0), then with (i8 16, i8 32), both
; from %i. The expected output is a single call with (i8 48, i8 0).
270 ; CHECK: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i)
271 define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
272 ; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
273 ; CHECK: ret <2 x i64> %[[RES]]
274 %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
275 %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
; Adjacent ranges, opposite order: insert with (i8 16, i8 32) first, then with
; (i8 32, i8 0), both from %i. The expected output is again a single call with
; (i8 48, i8 0).
279 ; CHECK: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i)
280 define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
281 ; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
282 ; CHECK: ret <2 x i64> %[[RES]]
283 %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
284 %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
; Disjoint ranges: insert with (i8 16, i8 0), then with (i8 16, i8 32), both
; from %i. The expected output still contains both calls, unchanged.
288 ; CHECK: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i)
289 define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
290 ; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
291 ; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
292 %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
293 %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
; Disjoint ranges, opposite order: insert with (i8 16, i8 32) first, then with
; (i8 16, i8 0), both from %i. This mirrors testInsertDisjointRange with the
; two inserts swapped, matching how the other *_2 tests reverse their
; counterpart. The expected output still contains both calls, unchanged.
297 ; CHECK: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i)
298 define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
299 ; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
300 ; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 0)
301 %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
302 %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 0)
; Intrinsic used by all testInsert* functions above; the directive expects the
; declaration to remain in the output.
307 ; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
308 declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind
; test_vpermilvar_ps: calls vpermilvar.ps on %v with the constant selector
; <3, 2, 1, 0>. The expected output is a shufflevector of %v with the same
; index mask.
310 declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
311 define <4 x float> @test_vpermilvar_ps(<4 x float> %v) {
312 ; CHECK-LABEL: @test_vpermilvar_ps(
313 ; CHECK: shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
314 %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
; test_vpermilvar_ps_256: calls vpermilvar.ps.256 on %v with the constant
; selector <7, 6, 5, 4, 3, 2, 1, 0>. The expected output is a shufflevector of
; %v with the same index mask.
318 declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
319 define <8 x float> @test_vpermilvar_ps_256(<8 x float> %v) {
320 ; CHECK-LABEL: @test_vpermilvar_ps_256(
321 ; CHECK: shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
322 %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
; test_vpermilvar_pd: calls vpermilvar.pd on %v with the constant selector
; <1, 0>. The expected output is a shufflevector of %v with the same index
; mask.
326 declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i32>)
327 define <2 x double> @test_vpermilvar_pd(<2 x double> %v) {
328 ; CHECK-LABEL: @test_vpermilvar_pd(
329 ; CHECK: shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 0>
330 %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i32> <i32 1, i32 0>)
; test_vpermilvar_pd_256: calls vpermilvar.pd.256 on %v with the constant
; selector <3, 2, 1, 0>. The expected output is a shufflevector of %v with the
; same index mask.
334 declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i32>)
335 define <4 x double> @test_vpermilvar_pd_256(<4 x double> %v) {
336 ; CHECK-LABEL: @test_vpermilvar_pd_256(
337 ; CHECK: shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
338 %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
; test_sse2_1: applies the SSE2 psll.w / psll.d / psll.q intrinsics (count
; passed as the vector %3, whose lane 0 is 1 and lane 1 is 0) and then the
; pslli.w / pslli.d / pslli.q intrinsics (count passed as the i32 %S = 1) to a
; constant <8 x i16> input, bitcasting between element widths in between. The
; directive at the end expects the whole chain to fold into one constant return.
342 define <2 x i64> @test_sse2_1() nounwind readnone uwtable {
343 %S = bitcast i32 1 to i32
344 %1 = zext i32 %S to i64
345 %2 = insertelement <2 x i64> undef, i64 %1, i32 0
346 %3 = insertelement <2 x i64> %2, i64 0, i32 1
347 %4 = bitcast <2 x i64> %3 to <8 x i16>
348 %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
349 %6 = bitcast <8 x i16> %5 to <4 x i32>
350 %7 = bitcast <2 x i64> %3 to <4 x i32>
351 %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
352 %9 = bitcast <4 x i32> %8 to <2 x i64>
353 %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
354 %11 = bitcast <2 x i64> %10 to <8 x i16>
355 %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
356 %13 = bitcast <8 x i16> %12 to <4 x i32>
357 %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
358 %15 = bitcast <4 x i32> %14 to <2 x i64>
359 %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
363 ; CHECK: ret <2 x i64> <i64 72058418680037440, i64 144117112246370624>
; test_avx2_1: applies the AVX2 psll.w / psll.d / psll.q intrinsics (count
; passed as the vector %3, whose lane 0 is 1 and lane 1 is 0) and then the
; pslli.w / pslli.d / pslli.q intrinsics (count passed as the i32 %S = 1) to a
; constant <16 x i16> input, bitcasting between element widths in between. The
; directive at the end expects the chain to fold to <64, 128, 192, 256>.
366 define <4 x i64> @test_avx2_1() nounwind readnone uwtable {
367 %S = bitcast i32 1 to i32
368 %1 = zext i32 %S to i64
369 %2 = insertelement <2 x i64> undef, i64 %1, i32 0
370 %3 = insertelement <2 x i64> %2, i64 0, i32 1
371 %4 = bitcast <2 x i64> %3 to <8 x i16>
372 %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
373 %6 = bitcast <16 x i16> %5 to <8 x i32>
374 %7 = bitcast <2 x i64> %3 to <4 x i32>
375 %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
376 %9 = bitcast <8 x i32> %8 to <4 x i64>
377 %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
378 %11 = bitcast <4 x i64> %10 to <16 x i16>
379 %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
380 %13 = bitcast <16 x i16> %12 to <8 x i32>
381 %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
382 %15 = bitcast <8 x i32> %14 to <4 x i64>
383 %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
386 ; CHECK: ret <4 x i64> <i64 64, i64 128, i64 192, i64 256>
; test_sse2_0: same instruction chain as test_sse2_1 (SSE2 psll.w/d/q followed
; by pslli.w/d/q over a constant <8 x i16> input), but with %S = 128 as the
; count. The directive at the end expects the result to fold to
; zeroinitializer.
389 define <2 x i64> @test_sse2_0() nounwind readnone uwtable {
390 %S = bitcast i32 128 to i32
391 %1 = zext i32 %S to i64
392 %2 = insertelement <2 x i64> undef, i64 %1, i32 0
393 %3 = insertelement <2 x i64> %2, i64 0, i32 1
394 %4 = bitcast <2 x i64> %3 to <8 x i16>
395 %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
396 %6 = bitcast <8 x i16> %5 to <4 x i32>
397 %7 = bitcast <2 x i64> %3 to <4 x i32>
398 %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
399 %9 = bitcast <4 x i32> %8 to <2 x i64>
400 %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
401 %11 = bitcast <2 x i64> %10 to <8 x i16>
402 %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
403 %13 = bitcast <8 x i16> %12 to <4 x i32>
404 %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
405 %15 = bitcast <4 x i32> %14 to <2 x i64>
406 %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
410 ; CHECK: ret <2 x i64> zeroinitializer
; test_avx2_0: same instruction chain as test_avx2_1 (AVX2 psll.w/d/q followed
; by pslli.w/d/q over a constant <16 x i16> input), but with %S = 128 as the
; count. The directive at the end expects the result to fold to
; zeroinitializer.
413 define <4 x i64> @test_avx2_0() nounwind readnone uwtable {
414 %S = bitcast i32 128 to i32
415 %1 = zext i32 %S to i64
416 %2 = insertelement <2 x i64> undef, i64 %1, i32 0
417 %3 = insertelement <2 x i64> %2, i64 0, i32 1
418 %4 = bitcast <2 x i64> %3 to <8 x i16>
419 %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
420 %6 = bitcast <16 x i16> %5 to <8 x i32>
421 %7 = bitcast <2 x i64> %3 to <4 x i32>
422 %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
423 %9 = bitcast <8 x i32> %8 to <4 x i64>
424 %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
425 %11 = bitcast <4 x i64> %10 to <16 x i16>
426 %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
427 %13 = bitcast <16 x i16> %12 to <8 x i32>
428 %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
429 %15 = bitcast <8 x i32> %14 to <4 x i64>
430 %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
433 ; CHECK: ret <4 x i64> zeroinitializer
; Declarations of the AVX2 and SSE2 psll / pslli intrinsics called by
; test_sse2_1, test_avx2_1, test_sse2_0 and test_avx2_0. The pslli.* forms take
; an i32 second argument; the psll.* forms take a 128-bit vector second
; argument. All of them share attribute group #1 (nounwind readnone).
436 declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) #1
437 declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) #1
438 declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) #1
439 declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) #1
440 declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) #1
441 declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) #1
442 declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) #1
443 declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) #1
444 declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) #1
445 declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1
446 declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1
447 declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1
449 attributes #1 = { nounwind readnone }