1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6 ; Signed Integer to Double
; v2i64 -> v2f64: no packed 64-bit-int convert exists pre-AVX512DQ, so each
; lane is moved to a GPR and converted scalar (cvtsi2sdq), then re-packed.
; The xorps/vxorps before the second convert breaks the false dependence on
; the destination register of the scalar convert.
9 define <2 x double> @sitofp_2vf64(<2 x i64> %a) {
10 ; SSE2-LABEL: sitofp_2vf64:
12 ; SSE2-NEXT: movd %xmm0, %rax
13 ; SSE2-NEXT: cvtsi2sdq %rax, %xmm1
14 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
15 ; SSE2-NEXT: movd %xmm0, %rax
16 ; SSE2-NEXT: xorps %xmm0, %xmm0
17 ; SSE2-NEXT: cvtsi2sdq %rax, %xmm0
18 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
19 ; SSE2-NEXT: movapd %xmm1, %xmm0
22 ; AVX-LABEL: sitofp_2vf64:
24 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
25 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
26 ; AVX-NEXT: vmovq %xmm0, %rax
27 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
28 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
29 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
31 %cvt = sitofp <2 x i64> %a to <2 x double>
; Low two i32 lanes -> v2f64: maps directly onto (v)cvtdq2pd, which reads the
; low 64 bits (two dwords) of its source, so no explicit shuffle is needed.
35 define <2 x double> @sitofp_2vf64_i32(<4 x i32> %a) {
36 ; SSE2-LABEL: sitofp_2vf64_i32:
38 ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
41 ; AVX-LABEL: sitofp_2vf64_i32:
43 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
45 %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
46 %cvt = sitofp <2 x i32> %shuf to <2 x double>
; Low two i16 lanes -> v2f64: SSE2 sign-extends i16->i32 via unpack + arithmetic
; shift right 16; AVX uses pmovsxwd. Then convert as i32 with cvtdq2pd.
50 define <2 x double> @sitofp_2vf64_i16(<8 x i16> %a) {
51 ; SSE2-LABEL: sitofp_2vf64_i16:
53 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
54 ; SSE2-NEXT: psrad $16, %xmm0
55 ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
58 ; AVX-LABEL: sitofp_2vf64_i16:
60 ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
61 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
63 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
64 %cvt = sitofp <2 x i16> %shuf to <2 x double>
; Low two i8 lanes -> v2f64: SSE2 widens i8->i32 with two unpacks then
; arithmetic shift right 24 (sign extension); AVX uses pmovsxbd directly.
68 define <2 x double> @sitofp_2vf64_i8(<16 x i8> %a) {
69 ; SSE2-LABEL: sitofp_2vf64_i8:
71 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
72 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
73 ; SSE2-NEXT: psrad $24, %xmm0
74 ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
77 ; AVX-LABEL: sitofp_2vf64_i8:
79 ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
80 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
82 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
83 %cvt = sitofp <2 x i8> %shuf to <2 x double>
; v4i64 -> v4f64: fully scalarized (four cvtsi2sdq). AVX1/AVX2 differ only in
; the 128-bit half extract (vextractf128 vs vextracti128); both rebuild the
; ymm result with vinsertf128.
87 define <4 x double> @sitofp_4vf64(<4 x i64> %a) {
88 ; SSE2-LABEL: sitofp_4vf64:
90 ; SSE2-NEXT: movd %xmm0, %rax
91 ; SSE2-NEXT: cvtsi2sdq %rax, %xmm2
92 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
93 ; SSE2-NEXT: movd %xmm0, %rax
94 ; SSE2-NEXT: xorps %xmm0, %xmm0
95 ; SSE2-NEXT: cvtsi2sdq %rax, %xmm0
96 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
97 ; SSE2-NEXT: movd %xmm1, %rax
98 ; SSE2-NEXT: cvtsi2sdq %rax, %xmm3
99 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
100 ; SSE2-NEXT: movd %xmm0, %rax
101 ; SSE2-NEXT: xorps %xmm0, %xmm0
102 ; SSE2-NEXT: cvtsi2sdq %rax, %xmm0
103 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
104 ; SSE2-NEXT: movapd %xmm2, %xmm0
105 ; SSE2-NEXT: movapd %xmm3, %xmm1
108 ; AVX1-LABEL: sitofp_4vf64:
110 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
111 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax
112 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
113 ; AVX1-NEXT: vmovq %xmm1, %rax
114 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
115 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
116 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
117 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
118 ; AVX1-NEXT: vmovq %xmm0, %rax
119 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
120 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
121 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
122 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
125 ; AVX2-LABEL: sitofp_4vf64:
127 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
128 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax
129 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
130 ; AVX2-NEXT: vmovq %xmm1, %rax
131 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
132 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
133 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
134 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
135 ; AVX2-NEXT: vmovq %xmm0, %rax
136 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
137 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
138 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
139 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
141 %cvt = sitofp <4 x i64> %a to <4 x double>
142 ret <4 x double> %cvt
; v4i32 -> v4f64: SSE2 needs two cvtdq2pd (each converts only the low two
; dwords) plus a pshufd to move the high pair down; AVX does it in a single
; 256-bit vcvtdq2pd.
145 define <4 x double> @sitofp_4vf64_i32(<4 x i32> %a) {
146 ; SSE2-LABEL: sitofp_4vf64_i32:
148 ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
149 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
150 ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
151 ; SSE2-NEXT: movaps %xmm2, %xmm0
154 ; AVX-LABEL: sitofp_4vf64_i32:
156 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
158 %cvt = sitofp <4 x i32> %a to <4 x double>
159 ret <4 x double> %cvt
; Low four i16 lanes -> v4f64: sign-extend to i32 (unpack+psrad on SSE2,
; pmovsxwd on AVX), then the i32->f64 pattern from sitofp_4vf64_i32.
162 define <4 x double> @sitofp_4vf64_i16(<8 x i16> %a) {
163 ; SSE2-LABEL: sitofp_4vf64_i16:
165 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
166 ; SSE2-NEXT: psrad $16, %xmm1
167 ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
168 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
169 ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
172 ; AVX-LABEL: sitofp_4vf64_i16:
174 ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
175 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
177 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
178 %cvt = sitofp <4 x i16> %shuf to <4 x double>
179 ret <4 x double> %cvt
; Low four i8 lanes -> v4f64: sign-extend i8->i32 (two unpacks + psrad $24 on
; SSE2, pmovsxbd on AVX), then convert in 64-bit halves / one ymm convert.
182 define <4 x double> @sitofp_4vf64_i8(<16 x i8> %a) {
183 ; SSE2-LABEL: sitofp_4vf64_i8:
185 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
186 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
187 ; SSE2-NEXT: psrad $24, %xmm1
188 ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
189 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
190 ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
193 ; AVX-LABEL: sitofp_4vf64_i8:
195 ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
196 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
198 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
199 %cvt = sitofp <4 x i8> %shuf to <4 x double>
200 ret <4 x double> %cvt
204 ; Unsigned Integer to Double
; v2i64 -> v2f64 unsigned: uses the classic bias trick — interleave each u64's
; 32-bit halves with exponent-bit constants (0x43300000 = 2^52, 0x45300000 =
; 2^84, seen here as [1127219200,1160773632]), subtract the matching bias
; doubles [4.503600e+15,1.934281e+25], and sum the two partial doubles
; (addpd of shuffled halves on SSE2, vhaddpd on AVX).
207 define <2 x double> @uitofp_2vf64(<2 x i64> %a) {
208 ; SSE2-LABEL: uitofp_2vf64:
210 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
211 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
212 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
213 ; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
214 ; SSE2-NEXT: subpd %xmm3, %xmm0
215 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
216 ; SSE2-NEXT: addpd %xmm4, %xmm0
217 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
218 ; SSE2-NEXT: subpd %xmm3, %xmm2
219 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
220 ; SSE2-NEXT: addpd %xmm2, %xmm1
221 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
224 ; AVX-LABEL: uitofp_2vf64:
226 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
227 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
228 ; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
229 ; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
230 ; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
231 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
232 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
233 ; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
234 ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
235 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
237 %cvt = uitofp <2 x i64> %a to <2 x double>
238 ret <2 x double> %cvt
; Low two u32 lanes -> v2f64: zero-extend u32 to u64 first (punpckldq with a
; zero vector on SSE2, vpmovzxdq on AVX), then the same subpd/add bias trick
; as uitofp_2vf64.
241 define <2 x double> @uitofp_2vf64_i32(<4 x i32> %a) {
242 ; SSE2-LABEL: uitofp_2vf64_i32:
244 ; SSE2-NEXT: pxor %xmm1, %xmm1
245 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
246 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
247 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
248 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
249 ; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
250 ; SSE2-NEXT: subpd %xmm3, %xmm0
251 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
252 ; SSE2-NEXT: addpd %xmm4, %xmm0
253 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
254 ; SSE2-NEXT: subpd %xmm3, %xmm2
255 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
256 ; SSE2-NEXT: addpd %xmm2, %xmm1
257 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
260 ; AVX-LABEL: uitofp_2vf64_i32:
262 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
263 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
264 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
265 ; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
266 ; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
267 ; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
268 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
269 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
270 ; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
271 ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
272 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
274 %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
275 %cvt = uitofp <2 x i32> %shuf to <2 x double>
276 ret <2 x double> %cvt
; Low two u16 lanes -> v2f64: zero-extend u16 to u64 (two unpacks with zero on
; SSE2, vpmovzxwq on AVX), then the shared bias-subtract conversion sequence.
; NOTE(review): the value still fits in 32 bits here, so the full u64 path is
; arguably overkill — kept as the generated expectation.
279 define <2 x double> @uitofp_2vf64_i16(<8 x i16> %a) {
280 ; SSE2-LABEL: uitofp_2vf64_i16:
282 ; SSE2-NEXT: pxor %xmm1, %xmm1
283 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
284 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
285 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
286 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
287 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
288 ; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
289 ; SSE2-NEXT: subpd %xmm3, %xmm0
290 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
291 ; SSE2-NEXT: addpd %xmm4, %xmm0
292 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
293 ; SSE2-NEXT: subpd %xmm3, %xmm2
294 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
295 ; SSE2-NEXT: addpd %xmm2, %xmm1
296 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
299 ; AVX-LABEL: uitofp_2vf64_i16:
301 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
302 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
303 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
304 ; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
305 ; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
306 ; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
307 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
308 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
309 ; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
310 ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
311 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
313 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
314 %cvt = uitofp <2 x i16> %shuf to <2 x double>
315 ret <2 x double> %cvt
; Low two u8 lanes -> v2f64: zero-extend u8 to u64 (three unpacks with zero on
; SSE2, vpmovzxbq on AVX), then the shared bias-subtract conversion sequence.
318 define <2 x double> @uitofp_2vf64_i8(<16 x i8> %a) {
319 ; SSE2-LABEL: uitofp_2vf64_i8:
321 ; SSE2-NEXT: pxor %xmm1, %xmm1
322 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
323 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
324 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
325 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
326 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
327 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
328 ; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
329 ; SSE2-NEXT: subpd %xmm3, %xmm0
330 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
331 ; SSE2-NEXT: addpd %xmm4, %xmm0
332 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
333 ; SSE2-NEXT: subpd %xmm3, %xmm2
334 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
335 ; SSE2-NEXT: addpd %xmm2, %xmm1
336 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
339 ; AVX-LABEL: uitofp_2vf64_i8:
341 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
342 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
343 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
344 ; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
345 ; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
346 ; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
347 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
348 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
349 ; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
350 ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
351 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
353 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
354 %cvt = uitofp <2 x i8> %shuf to <2 x double>
355 ret <2 x double> %cvt
; v4i64 -> v4f64 unsigned: the uitofp_2vf64 bias trick applied per 128-bit
; half; AVX1/AVX2 differ only in vextractf128 vs vextracti128 for the upper
; half, then both reassemble with vinsertf128.
358 define <4 x double> @uitofp_4vf64(<4 x i64> %a) {
359 ; SSE2-LABEL: uitofp_4vf64:
361 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
362 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
363 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
364 ; SSE2-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
365 ; SSE2-NEXT: subpd %xmm4, %xmm0
366 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
367 ; SSE2-NEXT: addpd %xmm5, %xmm0
368 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
369 ; SSE2-NEXT: subpd %xmm4, %xmm3
370 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
371 ; SSE2-NEXT: addpd %xmm3, %xmm5
372 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
373 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
374 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
375 ; SSE2-NEXT: subpd %xmm4, %xmm1
376 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
377 ; SSE2-NEXT: addpd %xmm5, %xmm1
378 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
379 ; SSE2-NEXT: subpd %xmm4, %xmm3
380 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
381 ; SSE2-NEXT: addpd %xmm3, %xmm2
382 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
385 ; AVX1-LABEL: uitofp_4vf64:
387 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
388 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
389 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
390 ; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
391 ; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
392 ; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
393 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
394 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
395 ; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1
396 ; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
397 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
398 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
399 ; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
400 ; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
401 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
402 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
403 ; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0
404 ; AVX1-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
405 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
406 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
409 ; AVX2-LABEL: uitofp_4vf64:
411 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
412 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
413 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
414 ; AVX2-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
415 ; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
416 ; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
417 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
418 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
419 ; AVX2-NEXT: vsubpd %xmm4, %xmm1, %xmm1
420 ; AVX2-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
421 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
422 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
423 ; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
424 ; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
425 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
426 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
427 ; AVX2-NEXT: vsubpd %xmm4, %xmm0, %xmm0
428 ; AVX2-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
429 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
430 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
432 %cvt = uitofp <4 x i64> %a to <4 x double>
433 ret <4 x double> %cvt
; v4i32 -> v4f64 unsigned: SSE2 zero-extends and runs the bias trick per pair;
; AVX splits each u32 into low-16/high-16 parts (pand mask / psrld $16),
; converts both as signed i32, scales the high part (vmulpd by 2^16) and adds.
436 define <4 x double> @uitofp_4vf64_i32(<4 x i32> %a) {
437 ; SSE2-LABEL: uitofp_4vf64_i32:
439 ; SSE2-NEXT: pxor %xmm1, %xmm1
440 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
441 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
442 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
443 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
444 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
445 ; SSE2-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
446 ; SSE2-NEXT: subpd %xmm4, %xmm0
447 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
448 ; SSE2-NEXT: addpd %xmm5, %xmm0
449 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
450 ; SSE2-NEXT: subpd %xmm4, %xmm1
451 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
452 ; SSE2-NEXT: addpd %xmm1, %xmm5
453 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
454 ; SSE2-NEXT: pand .LCPI13_2(%rip), %xmm2
455 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
456 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
457 ; SSE2-NEXT: subpd %xmm4, %xmm2
458 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
459 ; SSE2-NEXT: addpd %xmm2, %xmm1
460 ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
461 ; SSE2-NEXT: subpd %xmm4, %xmm5
462 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
463 ; SSE2-NEXT: addpd %xmm5, %xmm2
464 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
467 ; AVX1-LABEL: uitofp_4vf64_i32:
469 ; AVX1-NEXT: vpand .LCPI13_0(%rip), %xmm0, %xmm1
470 ; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
471 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
472 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
473 ; AVX1-NEXT: vmulpd .LCPI13_1(%rip), %ymm0, %ymm0
474 ; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
477 ; AVX2-LABEL: uitofp_4vf64_i32:
479 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
480 ; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
481 ; AVX2-NEXT: vbroadcastsd .LCPI13_0(%rip), %ymm2
482 ; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
483 ; AVX2-NEXT: vpbroadcastd .LCPI13_1(%rip), %xmm2
484 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
485 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
486 ; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
488 %cvt = uitofp <4 x i32> %a to <4 x double>
489 ret <4 x double> %cvt
; Low four u16 lanes -> v4f64: SSE2 zero-extends and runs the bias trick twice
; (plus shuffles to reach the upper pair); AVX zero-extends with vpmovzxwd and
; converts exactly — u16 values fit in i32, so signed vcvtdq2pd is correct.
492 define <4 x double> @uitofp_4vf64_i16(<8 x i16> %a) {
493 ; SSE2-LABEL: uitofp_4vf64_i16:
495 ; SSE2-NEXT: pxor %xmm1, %xmm1
496 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
497 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
498 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
499 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
500 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
501 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
502 ; SSE2-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
503 ; SSE2-NEXT: subpd %xmm4, %xmm0
504 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
505 ; SSE2-NEXT: addpd %xmm5, %xmm0
506 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
507 ; SSE2-NEXT: subpd %xmm4, %xmm1
508 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
509 ; SSE2-NEXT: addpd %xmm1, %xmm5
510 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
511 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,1,2,3,4,5,6,7]
512 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,7,5,6,7]
513 ; SSE2-NEXT: pand .LCPI14_2(%rip), %xmm2
514 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
515 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
516 ; SSE2-NEXT: subpd %xmm4, %xmm2
517 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
518 ; SSE2-NEXT: addpd %xmm2, %xmm1
519 ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
520 ; SSE2-NEXT: subpd %xmm4, %xmm5
521 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
522 ; SSE2-NEXT: addpd %xmm5, %xmm2
523 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
526 ; AVX-LABEL: uitofp_4vf64_i16:
528 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
529 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
531 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
532 %cvt = uitofp <4 x i16> %shuf to <4 x double>
533 ret <4 x double> %cvt
; Low four u8 lanes -> v4f64: SSE2 zero-extends and runs the bias trick per
; pair; AVX zero-extends with vpmovzxbd — u8 fits in i32, so one signed
; vcvtdq2pd suffices.
536 define <4 x double> @uitofp_4vf64_i8(<16 x i8> %a) {
537 ; SSE2-LABEL: uitofp_4vf64_i8:
539 ; SSE2-NEXT: movdqa %xmm0, %xmm1
540 ; SSE2-NEXT: pxor %xmm2, %xmm2
541 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
542 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
543 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
544 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
545 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
546 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
547 ; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
548 ; SSE2-NEXT: subpd %xmm3, %xmm0
549 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
550 ; SSE2-NEXT: addpd %xmm5, %xmm0
551 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
552 ; SSE2-NEXT: subpd %xmm3, %xmm4
553 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
554 ; SSE2-NEXT: addpd %xmm4, %xmm5
555 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
556 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
557 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
558 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
559 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,7,5,6,7]
560 ; SSE2-NEXT: pand .LCPI15_2(%rip), %xmm4
561 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
562 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
563 ; SSE2-NEXT: subpd %xmm3, %xmm4
564 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,0,1]
565 ; SSE2-NEXT: addpd %xmm4, %xmm1
566 ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
567 ; SSE2-NEXT: subpd %xmm3, %xmm5
568 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
569 ; SSE2-NEXT: addpd %xmm5, %xmm2
570 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
573 ; AVX-LABEL: uitofp_4vf64_i8:
575 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
576 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
578 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
579 %cvt = uitofp <4 x i8> %shuf to <4 x double>
580 ret <4 x double> %cvt
584 ; Signed Integer to Float
; v4i32 -> v4f32: single-instruction (v)cvtdq2ps on every target.
587 define <4 x float> @sitofp_4vf32(<4 x i32> %a) {
588 ; SSE2-LABEL: sitofp_4vf32:
590 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
593 ; AVX-LABEL: sitofp_4vf32:
595 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
597 %cvt = sitofp <4 x i32> %a to <4 x float>
; v2i64 -> v2f32 widened to v4f32: scalarized via cvtsi2ssq per lane.
; NOTE(review): the AVX sequence converts %rax two extra times to fill the
; undef upper lanes of the widened shuffle result — redundant but harmless,
; since those lanes are undef in the IR.
601 define <4 x float> @sitofp_4vf32_i64(<2 x i64> %a) {
602 ; SSE2-LABEL: sitofp_4vf32_i64:
604 ; SSE2-NEXT: movd %xmm0, %rax
605 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
606 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
607 ; SSE2-NEXT: movd %xmm0, %rax
608 ; SSE2-NEXT: xorps %xmm0, %xmm0
609 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
610 ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
611 ; SSE2-NEXT: movaps %xmm1, %xmm0
614 ; AVX-LABEL: sitofp_4vf32_i64:
616 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
617 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
618 ; AVX-NEXT: vmovq %xmm0, %rax
619 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
620 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
621 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
622 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
623 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
624 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
626 %cvt = sitofp <2 x i64> %a to <2 x float>
627 %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; Low four i16 lanes -> v4f32: sign-extend to i32 (unpack+psrad on SSE2,
; pmovsxwd on AVX), then one (v)cvtdq2ps.
631 define <4 x float> @sitofp_4vf32_i16(<8 x i16> %a) {
632 ; SSE2-LABEL: sitofp_4vf32_i16:
634 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
635 ; SSE2-NEXT: psrad $16, %xmm0
636 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
639 ; AVX-LABEL: sitofp_4vf32_i16:
641 ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
642 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
644 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
645 %cvt = sitofp <4 x i16> %shuf to <4 x float>
; Low four i8 lanes -> v4f32: sign-extend i8->i32 (two unpacks + psrad $24 on
; SSE2, pmovsxbd on AVX), then one (v)cvtdq2ps.
649 define <4 x float> @sitofp_4vf32_i8(<16 x i8> %a) {
650 ; SSE2-LABEL: sitofp_4vf32_i8:
652 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
653 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
654 ; SSE2-NEXT: psrad $24, %xmm0
655 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
658 ; AVX-LABEL: sitofp_4vf32_i8:
660 ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
661 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
663 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
664 %cvt = sitofp <4 x i8> %shuf to <4 x float>
; v8i32 -> v8f32: two xmm cvtdq2ps on SSE2 (one per 128-bit half), a single
; 256-bit vcvtdq2ps on AVX.
668 define <8 x float> @sitofp_8vf32(<8 x i32> %a) {
669 ; SSE2-LABEL: sitofp_8vf32:
671 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
672 ; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1
675 ; AVX-LABEL: sitofp_8vf32:
677 ; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
679 %cvt = sitofp <8 x i32> %a to <8 x float>
; v4i64 -> v4f32: fully scalarized (four cvtsi2ssq), results assembled with
; unpcklps (SSE2) or vinsertps (AVX). The AVX paths end in vzeroupper because
; a ymm register was read but the function returns an xmm value.
683 define <4 x float> @sitofp_4vf32_4i64(<4 x i64> %a) {
684 ; SSE2-LABEL: sitofp_4vf32_4i64:
686 ; SSE2-NEXT: movd %xmm1, %rax
687 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm3
688 ; SSE2-NEXT: movd %xmm0, %rax
689 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm2
690 ; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
691 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
692 ; SSE2-NEXT: movd %xmm1, %rax
693 ; SSE2-NEXT: xorps %xmm1, %xmm1
694 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
695 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
696 ; SSE2-NEXT: movd %xmm0, %rax
697 ; SSE2-NEXT: xorps %xmm0, %xmm0
698 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
699 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
700 ; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
701 ; SSE2-NEXT: movaps %xmm2, %xmm0
704 ; AVX1-LABEL: sitofp_4vf32_4i64:
706 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
707 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
708 ; AVX1-NEXT: vmovq %xmm0, %rax
709 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
710 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
711 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
712 ; AVX1-NEXT: vmovq %xmm0, %rax
713 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
714 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
715 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
716 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
717 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
718 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
719 ; AVX1-NEXT: vzeroupper
722 ; AVX2-LABEL: sitofp_4vf32_4i64:
724 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
725 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
726 ; AVX2-NEXT: vmovq %xmm0, %rax
727 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
728 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
729 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
730 ; AVX2-NEXT: vmovq %xmm0, %rax
731 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
732 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
733 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
734 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
735 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
736 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
737 ; AVX2-NEXT: vzeroupper
739 %cvt = sitofp <4 x i64> %a to <4 x float>
; v8i16 -> v8f32: sign-extend to i32 per half (SSE2/AVX1) or in one ymm step
; (AVX2 vpmovsxwd ymm), then convert with cvtdq2ps.
743 define <8 x float> @sitofp_8vf32_i16(<8 x i16> %a) {
744 ; SSE2-LABEL: sitofp_8vf32_i16:
746 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
747 ; SSE2-NEXT: psrad $16, %xmm1
748 ; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2
749 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
750 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
751 ; SSE2-NEXT: psrad $16, %xmm0
752 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1
753 ; SSE2-NEXT: movaps %xmm2, %xmm0
756 ; AVX1-LABEL: sitofp_8vf32_i16:
758 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
759 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
760 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
761 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
762 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
765 ; AVX2-LABEL: sitofp_8vf32_i16:
767 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
768 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
770 %cvt = sitofp <8 x i16> %a to <8 x float>
; Low eight i8 lanes -> v8f32: sign-extend i8->i32 per half (SSE2/AVX1 use
; pmovsxbd / unpack+psrad), AVX2 zero-extends then re-signs with
; pslld $24 + psrad $24 on the full ymm, then cvtdq2ps.
774 define <8 x float> @sitofp_8vf32_i8(<16 x i8> %a) {
775 ; SSE2-LABEL: sitofp_8vf32_i8:
777 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
778 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
779 ; SSE2-NEXT: psrad $24, %xmm1
780 ; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2
781 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
782 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
783 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
784 ; SSE2-NEXT: psrad $24, %xmm0
785 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1
786 ; SSE2-NEXT: movaps %xmm2, %xmm0
789 ; AVX1-LABEL: sitofp_8vf32_i8:
791 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
792 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
793 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
794 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
795 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
798 ; AVX2-LABEL: sitofp_8vf32_i8:
800 ; AVX2-NEXT: vpmovzxbd %xmm0, %ymm0
801 ; AVX2-NEXT: vpslld $24, %ymm0, %ymm0
802 ; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
803 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
805 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
806 %cvt = sitofp <8 x i8> %shuf to <8 x float>
811 ; Unsigned Integer to Float
; v4i32 -> v4f32 unsigned: split each u32 into a low-16 half (pand 0xFFFF) and
; a high-16 half (psrld $16), merge each half with float-exponent bits (por /
; vpblendw against constant-pool data), fold out the biases with addps, and
; sum the two halves. AVX2 broadcasts the constants instead of full loads.
814 define <4 x float> @uitofp_4vf32(<4 x i32> %a) {
815 ; SSE2-LABEL: uitofp_4vf32:
817 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
818 ; SSE2-NEXT: pand %xmm0, %xmm1
819 ; SSE2-NEXT: por .LCPI24_1(%rip), %xmm1
820 ; SSE2-NEXT: psrld $16, %xmm0
821 ; SSE2-NEXT: por .LCPI24_2(%rip), %xmm0
822 ; SSE2-NEXT: addps .LCPI24_3(%rip), %xmm0
823 ; SSE2-NEXT: addps %xmm1, %xmm0
826 ; AVX1-LABEL: uitofp_4vf32:
828 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
829 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
830 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
831 ; AVX1-NEXT: vaddps .LCPI24_2(%rip), %xmm0, %xmm0
832 ; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
835 ; AVX2-LABEL: uitofp_4vf32:
837 ; AVX2-NEXT: vpbroadcastd .LCPI24_0(%rip), %xmm1
838 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
839 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
840 ; AVX2-NEXT: vpbroadcastd .LCPI24_1(%rip), %xmm2
841 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
842 ; AVX2-NEXT: vbroadcastss .LCPI24_2(%rip), %xmm2
843 ; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0
844 ; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
846 %cvt = uitofp <4 x i32> %a to <4 x float>
850 define <4 x float> @uitofp_4vf32_i64(<2 x i64> %a) {
851 ; SSE2-LABEL: uitofp_4vf32_i64:
853 ; SSE2-NEXT: movdqa %xmm0, %xmm1
854 ; SSE2-NEXT: movd %xmm1, %rax
855 ; SSE2-NEXT: movl %eax, %ecx
856 ; SSE2-NEXT: andl $1, %ecx
857 ; SSE2-NEXT: testq %rax, %rax
858 ; SSE2-NEXT: js .LBB25_1
860 ; SSE2-NEXT: xorps %xmm0, %xmm0
861 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
862 ; SSE2-NEXT: jmp .LBB25_3
863 ; SSE2-NEXT: .LBB25_1:
864 ; SSE2-NEXT: shrq %rax
865 ; SSE2-NEXT: orq %rax, %rcx
866 ; SSE2-NEXT: xorps %xmm0, %xmm0
867 ; SSE2-NEXT: cvtsi2ssq %rcx, %xmm0
868 ; SSE2-NEXT: addss %xmm0, %xmm0
869 ; SSE2-NEXT: .LBB25_3:
870 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
871 ; SSE2-NEXT: movd %xmm1, %rax
872 ; SSE2-NEXT: movl %eax, %ecx
873 ; SSE2-NEXT: andl $1, %ecx
874 ; SSE2-NEXT: testq %rax, %rax
875 ; SSE2-NEXT: js .LBB25_4
877 ; SSE2-NEXT: xorps %xmm1, %xmm1
878 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
879 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
881 ; SSE2-NEXT: .LBB25_4:
882 ; SSE2-NEXT: shrq %rax
883 ; SSE2-NEXT: orq %rax, %rcx
884 ; SSE2-NEXT: xorps %xmm1, %xmm1
885 ; SSE2-NEXT: cvtsi2ssq %rcx, %xmm1
886 ; SSE2-NEXT: addss %xmm1, %xmm1
887 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
890 ; AVX-LABEL: uitofp_4vf32_i64:
892 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
893 ; AVX-NEXT: movl %eax, %ecx
894 ; AVX-NEXT: andl $1, %ecx
895 ; AVX-NEXT: testq %rax, %rax
896 ; AVX-NEXT: js .LBB25_1
898 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
899 ; AVX-NEXT: jmp .LBB25_3
900 ; AVX-NEXT: .LBB25_1:
901 ; AVX-NEXT: shrq %rax
902 ; AVX-NEXT: orq %rax, %rcx
903 ; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
904 ; AVX-NEXT: vaddss %xmm1, %xmm1, %xmm1
905 ; AVX-NEXT: .LBB25_3:
906 ; AVX-NEXT: vmovq %xmm0, %rax
907 ; AVX-NEXT: movl %eax, %ecx
908 ; AVX-NEXT: andl $1, %ecx
909 ; AVX-NEXT: testq %rax, %rax
910 ; AVX-NEXT: js .LBB25_4
912 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
913 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
914 ; AVX-NEXT: jmp .LBB25_6
915 ; AVX-NEXT: .LBB25_4:
916 ; AVX-NEXT: shrq %rax
917 ; AVX-NEXT: orq %rax, %rcx
918 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
919 ; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
920 ; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0
921 ; AVX-NEXT: .LBB25_6:
922 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
923 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
924 ; AVX-NEXT: testq %rax, %rax
925 ; AVX-NEXT: js .LBB25_8
927 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
928 ; AVX-NEXT: .LBB25_8:
929 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
930 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
932 %cvt = uitofp <2 x i64> %a to <2 x float>
933 %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
937 define <4 x float> @uitofp_4vf32_i16(<8 x i16> %a) {
938 ; SSE2-LABEL: uitofp_4vf32_i16:
940 ; SSE2-NEXT: pxor %xmm1, %xmm1
941 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
942 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
945 ; AVX-LABEL: uitofp_4vf32_i16:
947 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
948 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
950 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
951 %cvt = uitofp <4 x i16> %shuf to <4 x float>
955 define <4 x float> @uitofp_4vf32_i8(<16 x i8> %a) {
956 ; SSE2-LABEL: uitofp_4vf32_i8:
958 ; SSE2-NEXT: pxor %xmm1, %xmm1
959 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
960 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
961 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
964 ; AVX-LABEL: uitofp_4vf32_i8:
966 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
967 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
969 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
970 %cvt = uitofp <4 x i8> %shuf to <4 x float>
974 define <8 x float> @uitofp_8vf32(<8 x i32> %a) {
975 ; SSE2-LABEL: uitofp_8vf32:
977 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
978 ; SSE2-NEXT: movdqa %xmm0, %xmm3
979 ; SSE2-NEXT: pand %xmm2, %xmm3
980 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
981 ; SSE2-NEXT: por %xmm4, %xmm3
982 ; SSE2-NEXT: psrld $16, %xmm0
983 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
984 ; SSE2-NEXT: por %xmm5, %xmm0
985 ; SSE2-NEXT: movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
986 ; SSE2-NEXT: addps %xmm6, %xmm0
987 ; SSE2-NEXT: addps %xmm3, %xmm0
988 ; SSE2-NEXT: pand %xmm1, %xmm2
989 ; SSE2-NEXT: por %xmm4, %xmm2
990 ; SSE2-NEXT: psrld $16, %xmm1
991 ; SSE2-NEXT: por %xmm5, %xmm1
992 ; SSE2-NEXT: addps %xmm6, %xmm1
993 ; SSE2-NEXT: addps %xmm2, %xmm1
996 ; AVX1-LABEL: uitofp_8vf32:
998 ; AVX1-NEXT: vandps .LCPI28_0(%rip), %ymm0, %ymm1
999 ; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
1000 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
1001 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1002 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
1003 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
1004 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1005 ; AVX1-NEXT: vmulps .LCPI28_1(%rip), %ymm0, %ymm0
1006 ; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0
1009 ; AVX2-LABEL: uitofp_8vf32:
1011 ; AVX2-NEXT: vpbroadcastd .LCPI28_0(%rip), %ymm1
1012 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1013 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
1014 ; AVX2-NEXT: vpbroadcastd .LCPI28_1(%rip), %ymm2
1015 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
1016 ; AVX2-NEXT: vbroadcastss .LCPI28_2(%rip), %ymm2
1017 ; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
1018 ; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
1020 %cvt = uitofp <8 x i32> %a to <8 x float>
1021 ret <8 x float> %cvt
1024 define <4 x float> @uitofp_4vf32_4i64(<4 x i64> %a) {
1025 ; SSE2-LABEL: uitofp_4vf32_4i64:
1027 ; SSE2-NEXT: movd %xmm1, %rax
1028 ; SSE2-NEXT: movl %eax, %ecx
1029 ; SSE2-NEXT: andl $1, %ecx
1030 ; SSE2-NEXT: testq %rax, %rax
1031 ; SSE2-NEXT: js .LBB29_1
1032 ; SSE2-NEXT: # BB#2:
1033 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm3
1034 ; SSE2-NEXT: jmp .LBB29_3
1035 ; SSE2-NEXT: .LBB29_1:
1036 ; SSE2-NEXT: shrq %rax
1037 ; SSE2-NEXT: orq %rax, %rcx
1038 ; SSE2-NEXT: cvtsi2ssq %rcx, %xmm3
1039 ; SSE2-NEXT: addss %xmm3, %xmm3
1040 ; SSE2-NEXT: .LBB29_3:
1041 ; SSE2-NEXT: movd %xmm0, %rax
1042 ; SSE2-NEXT: movl %eax, %ecx
1043 ; SSE2-NEXT: andl $1, %ecx
1044 ; SSE2-NEXT: testq %rax, %rax
1045 ; SSE2-NEXT: js .LBB29_4
1046 ; SSE2-NEXT: # BB#5:
1047 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm2
1048 ; SSE2-NEXT: jmp .LBB29_6
1049 ; SSE2-NEXT: .LBB29_4:
1050 ; SSE2-NEXT: shrq %rax
1051 ; SSE2-NEXT: orq %rax, %rcx
1052 ; SSE2-NEXT: cvtsi2ssq %rcx, %xmm2
1053 ; SSE2-NEXT: addss %xmm2, %xmm2
1054 ; SSE2-NEXT: .LBB29_6:
1055 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1056 ; SSE2-NEXT: movd %xmm1, %rax
1057 ; SSE2-NEXT: movl %eax, %ecx
1058 ; SSE2-NEXT: andl $1, %ecx
1059 ; SSE2-NEXT: testq %rax, %rax
1060 ; SSE2-NEXT: js .LBB29_7
1061 ; SSE2-NEXT: # BB#8:
1062 ; SSE2-NEXT: xorps %xmm1, %xmm1
1063 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
1064 ; SSE2-NEXT: jmp .LBB29_9
1065 ; SSE2-NEXT: .LBB29_7:
1066 ; SSE2-NEXT: shrq %rax
1067 ; SSE2-NEXT: orq %rax, %rcx
1068 ; SSE2-NEXT: xorps %xmm1, %xmm1
1069 ; SSE2-NEXT: cvtsi2ssq %rcx, %xmm1
1070 ; SSE2-NEXT: addss %xmm1, %xmm1
1071 ; SSE2-NEXT: .LBB29_9:
1072 ; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1073 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1074 ; SSE2-NEXT: movd %xmm0, %rax
1075 ; SSE2-NEXT: movl %eax, %ecx
1076 ; SSE2-NEXT: andl $1, %ecx
1077 ; SSE2-NEXT: testq %rax, %rax
1078 ; SSE2-NEXT: js .LBB29_10
1079 ; SSE2-NEXT: # BB#11:
1080 ; SSE2-NEXT: xorps %xmm0, %xmm0
1081 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
1082 ; SSE2-NEXT: jmp .LBB29_12
1083 ; SSE2-NEXT: .LBB29_10:
1084 ; SSE2-NEXT: shrq %rax
1085 ; SSE2-NEXT: orq %rax, %rcx
1086 ; SSE2-NEXT: xorps %xmm0, %xmm0
1087 ; SSE2-NEXT: cvtsi2ssq %rcx, %xmm0
1088 ; SSE2-NEXT: addss %xmm0, %xmm0
1089 ; SSE2-NEXT: .LBB29_12:
1090 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1091 ; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1092 ; SSE2-NEXT: movaps %xmm2, %xmm0
1095 ; AVX1-LABEL: uitofp_4vf32_4i64:
1097 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
1098 ; AVX1-NEXT: movl %eax, %ecx
1099 ; AVX1-NEXT: andl $1, %ecx
1100 ; AVX1-NEXT: testq %rax, %rax
1101 ; AVX1-NEXT: js .LBB29_1
1102 ; AVX1-NEXT: # BB#2:
1103 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1104 ; AVX1-NEXT: jmp .LBB29_3
1105 ; AVX1-NEXT: .LBB29_1:
1106 ; AVX1-NEXT: shrq %rax
1107 ; AVX1-NEXT: orq %rax, %rcx
1108 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1109 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
1110 ; AVX1-NEXT: .LBB29_3:
1111 ; AVX1-NEXT: vmovq %xmm0, %rax
1112 ; AVX1-NEXT: movl %eax, %ecx
1113 ; AVX1-NEXT: andl $1, %ecx
1114 ; AVX1-NEXT: testq %rax, %rax
1115 ; AVX1-NEXT: js .LBB29_4
1116 ; AVX1-NEXT: # BB#5:
1117 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1118 ; AVX1-NEXT: jmp .LBB29_6
1119 ; AVX1-NEXT: .LBB29_4:
1120 ; AVX1-NEXT: shrq %rax
1121 ; AVX1-NEXT: orq %rax, %rcx
1122 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1123 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
1124 ; AVX1-NEXT: .LBB29_6:
1125 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1126 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1127 ; AVX1-NEXT: vmovq %xmm0, %rax
1128 ; AVX1-NEXT: movl %eax, %ecx
1129 ; AVX1-NEXT: andl $1, %ecx
1130 ; AVX1-NEXT: testq %rax, %rax
1131 ; AVX1-NEXT: js .LBB29_7
1132 ; AVX1-NEXT: # BB#8:
1133 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1134 ; AVX1-NEXT: jmp .LBB29_9
1135 ; AVX1-NEXT: .LBB29_7:
1136 ; AVX1-NEXT: shrq %rax
1137 ; AVX1-NEXT: orq %rax, %rcx
1138 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1139 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
1140 ; AVX1-NEXT: .LBB29_9:
1141 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1142 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
1143 ; AVX1-NEXT: movl %eax, %ecx
1144 ; AVX1-NEXT: andl $1, %ecx
1145 ; AVX1-NEXT: testq %rax, %rax
1146 ; AVX1-NEXT: js .LBB29_10
1147 ; AVX1-NEXT: # BB#11:
1148 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
1149 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1150 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1151 ; AVX1-NEXT: vzeroupper
1153 ; AVX1-NEXT: .LBB29_10:
1154 ; AVX1-NEXT: shrq %rax
1155 ; AVX1-NEXT: orq %rax, %rcx
1156 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1157 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
1158 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1159 ; AVX1-NEXT: vzeroupper
1162 ; AVX2-LABEL: uitofp_4vf32_4i64:
1164 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
1165 ; AVX2-NEXT: movl %eax, %ecx
1166 ; AVX2-NEXT: andl $1, %ecx
1167 ; AVX2-NEXT: testq %rax, %rax
1168 ; AVX2-NEXT: js .LBB29_1
1169 ; AVX2-NEXT: # BB#2:
1170 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1171 ; AVX2-NEXT: jmp .LBB29_3
1172 ; AVX2-NEXT: .LBB29_1:
1173 ; AVX2-NEXT: shrq %rax
1174 ; AVX2-NEXT: orq %rax, %rcx
1175 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1176 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
1177 ; AVX2-NEXT: .LBB29_3:
1178 ; AVX2-NEXT: vmovq %xmm0, %rax
1179 ; AVX2-NEXT: movl %eax, %ecx
1180 ; AVX2-NEXT: andl $1, %ecx
1181 ; AVX2-NEXT: testq %rax, %rax
1182 ; AVX2-NEXT: js .LBB29_4
1183 ; AVX2-NEXT: # BB#5:
1184 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1185 ; AVX2-NEXT: jmp .LBB29_6
1186 ; AVX2-NEXT: .LBB29_4:
1187 ; AVX2-NEXT: shrq %rax
1188 ; AVX2-NEXT: orq %rax, %rcx
1189 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1190 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
1191 ; AVX2-NEXT: .LBB29_6:
1192 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1193 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1194 ; AVX2-NEXT: vmovq %xmm0, %rax
1195 ; AVX2-NEXT: movl %eax, %ecx
1196 ; AVX2-NEXT: andl $1, %ecx
1197 ; AVX2-NEXT: testq %rax, %rax
1198 ; AVX2-NEXT: js .LBB29_7
1199 ; AVX2-NEXT: # BB#8:
1200 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1201 ; AVX2-NEXT: jmp .LBB29_9
1202 ; AVX2-NEXT: .LBB29_7:
1203 ; AVX2-NEXT: shrq %rax
1204 ; AVX2-NEXT: orq %rax, %rcx
1205 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1206 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
1207 ; AVX2-NEXT: .LBB29_9:
1208 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1209 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
1210 ; AVX2-NEXT: movl %eax, %ecx
1211 ; AVX2-NEXT: andl $1, %ecx
1212 ; AVX2-NEXT: testq %rax, %rax
1213 ; AVX2-NEXT: js .LBB29_10
1214 ; AVX2-NEXT: # BB#11:
1215 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
1216 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1217 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1218 ; AVX2-NEXT: vzeroupper
1220 ; AVX2-NEXT: .LBB29_10:
1221 ; AVX2-NEXT: shrq %rax
1222 ; AVX2-NEXT: orq %rax, %rcx
1223 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1224 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
1225 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1226 ; AVX2-NEXT: vzeroupper
1228 %cvt = uitofp <4 x i64> %a to <4 x float>
1229 ret <4 x float> %cvt
1232 define <8 x float> @uitofp_8vf32_i16(<8 x i16> %a) {
1233 ; SSE2-LABEL: uitofp_8vf32_i16:
1235 ; SSE2-NEXT: pxor %xmm1, %xmm1
1236 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1237 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1238 ; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2
1239 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1240 ; SSE2-NEXT: pand .LCPI30_0(%rip), %xmm0
1241 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1
1242 ; SSE2-NEXT: movaps %xmm2, %xmm0
1245 ; AVX1-LABEL: uitofp_8vf32_i16:
1247 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1248 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1249 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1250 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1251 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1254 ; AVX2-LABEL: uitofp_8vf32_i16:
1256 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1257 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1259 %cvt = uitofp <8 x i16> %a to <8 x float>
1260 ret <8 x float> %cvt
1263 define <8 x float> @uitofp_8vf32_i8(<16 x i8> %a) {
1264 ; SSE2-LABEL: uitofp_8vf32_i8:
1266 ; SSE2-NEXT: pxor %xmm1, %xmm1
1267 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1268 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1269 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1270 ; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2
1271 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1272 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1273 ; SSE2-NEXT: pand .LCPI31_0(%rip), %xmm0
1274 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1
1275 ; SSE2-NEXT: movaps %xmm2, %xmm0
1278 ; AVX1-LABEL: uitofp_8vf32_i8:
1280 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1281 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1282 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1283 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1284 ; AVX1-NEXT: vandps .LCPI31_0(%rip), %ymm0, %ymm0
1285 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1288 ; AVX2-LABEL: uitofp_8vf32_i8:
1290 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1291 ; AVX2-NEXT: vpbroadcastd .LCPI31_0(%rip), %ymm1
1292 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1293 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1295 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1296 %cvt = uitofp <8 x i8> %shuf to <8 x float>
1297 ret <8 x float> %cvt
1304 %Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }>
1305 define void @aggregate_sitofp_8f32_i16(%Arguments* nocapture readonly %a0) {
1306 ; SSE2-LABEL: aggregate_sitofp_8f32_i16:
1308 ; SSE2-NEXT: movq 24(%rdi), %rax
1309 ; SSE2-NEXT: movdqu 8(%rdi), %xmm0
1310 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1311 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1312 ; SSE2-NEXT: psrad $16, %xmm1
1313 ; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1
1314 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1315 ; SSE2-NEXT: psrad $16, %xmm0
1316 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
1317 ; SSE2-NEXT: movaps %xmm0, (%rax)
1318 ; SSE2-NEXT: movaps %xmm1, 16(%rax)
1321 ; AVX1-LABEL: aggregate_sitofp_8f32_i16:
1323 ; AVX1-NEXT: movq 24(%rdi), %rax
1324 ; AVX1-NEXT: vmovdqu 8(%rdi), %xmm0
1325 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
1326 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1327 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
1328 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1329 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1330 ; AVX1-NEXT: vmovaps %ymm0, (%rax)
1331 ; AVX1-NEXT: vzeroupper
1334 ; AVX2-LABEL: aggregate_sitofp_8f32_i16:
1336 ; AVX2-NEXT: movq 24(%rdi), %rax
1337 ; AVX2-NEXT: vpmovsxwd 8(%rdi), %ymm0
1338 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1339 ; AVX2-NEXT: vmovaps %ymm0, (%rax)
1340 ; AVX2-NEXT: vzeroupper
1342 %1 = load %Arguments, %Arguments* %a0, align 1
1343 %2 = extractvalue %Arguments %1, 1
1344 %3 = extractvalue %Arguments %1, 2
1345 %4 = sitofp <8 x i16> %2 to <8 x float>
1346 store <8 x float> %4, <8 x float>* %3, align 32