1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; Zero-extend the low 8 bytes of a v16i8 to v8i16. SSE2/SSSE3 interleave with
; a zeroed register (pxor + punpcklbw); SSE4.1 and AVX select pmovzxbw directly.
7 define <8 x i16> @zext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
8 ; SSE2-LABEL: zext_16i8_to_8i16:
9 ; SSE2: # BB#0: # %entry
10 ; SSE2-NEXT: pxor %xmm1, %xmm1
11 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
14 ; SSSE3-LABEL: zext_16i8_to_8i16:
15 ; SSSE3: # BB#0: # %entry
16 ; SSSE3-NEXT: pxor %xmm1, %xmm1
17 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
20 ; SSE41-LABEL: zext_16i8_to_8i16:
21 ; SSE41: # BB#0: # %entry
22 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
25 ; AVX-LABEL: zext_16i8_to_8i16:
26 ; AVX: # BB#0: # %entry
27 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
30 %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
31 %C = zext <8 x i8> %B to <8 x i16>
; Full v16i8 -> v16i16 zero extension (256-bit result split across two xmm
; registers pre-AVX2). SSE variants build the high half with punpckhbw + pand
; of a memory constant; AVX2 does the whole widening with one vpmovzxbw ymm.
36 define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) {
37 ; SSE2-LABEL: zext_16i8_to_16i16:
38 ; SSE2: # BB#0: # %entry
39 ; SSE2-NEXT: movdqa %xmm0, %xmm1
40 ; SSE2-NEXT: pxor %xmm2, %xmm2
41 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
42 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
43 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
46 ; SSSE3-LABEL: zext_16i8_to_16i16:
47 ; SSSE3: # BB#0: # %entry
48 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
49 ; SSSE3-NEXT: pxor %xmm2, %xmm2
50 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
51 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
52 ; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
55 ; SSE41-LABEL: zext_16i8_to_16i16:
56 ; SSE41: # BB#0: # %entry
57 ; SSE41-NEXT: movdqa %xmm0, %xmm1
58 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
59 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
60 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
63 ; AVX1-LABEL: zext_16i8_to_16i16:
64 ; AVX1: # BB#0: # %entry
65 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
66 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
67 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
68 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
71 ; AVX2-LABEL: zext_16i8_to_16i16:
72 ; AVX2: # BB#0: # %entry
73 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
76 %B = zext <16 x i8> %A to <16 x i16>
; Zero-extend the low 4 bytes of a v16i8 to v4i32. SSE2/SSSE3 need two
; interleaves with zero (bytes->words->dwords); SSE4.1/AVX use pmovzxbd.
80 define <4 x i32> @zext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp {
81 ; SSE2-LABEL: zext_16i8_to_4i32:
82 ; SSE2: # BB#0: # %entry
83 ; SSE2-NEXT: pxor %xmm1, %xmm1
84 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
85 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
88 ; SSSE3-LABEL: zext_16i8_to_4i32:
89 ; SSSE3: # BB#0: # %entry
90 ; SSSE3-NEXT: pxor %xmm1, %xmm1
91 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
92 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
95 ; SSE41-LABEL: zext_16i8_to_4i32:
96 ; SSE41: # BB#0: # %entry
97 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
100 ; AVX-LABEL: zext_16i8_to_4i32:
101 ; AVX: # BB#0: # %entry
102 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
105 %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
106 %C = zext <4 x i8> %B to <4 x i32>
; Zero-extend the low 8 bytes of a v16i8 to v8i32 (256-bit result). SSE paths
; build the high dword half via unpack + pand masking; AVX1 combines two
; pmovzx forms with vinsertf128 + vandps; AVX2 uses vpmovzxbd ymm + vpand.
110 define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp {
111 ; SSE2-LABEL: zext_16i8_to_8i32:
112 ; SSE2: # BB#0: # %entry
113 ; SSE2-NEXT: movdqa %xmm0, %xmm1
114 ; SSE2-NEXT: pxor %xmm2, %xmm2
115 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
116 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
117 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
118 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
119 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
122 ; SSSE3-LABEL: zext_16i8_to_8i32:
123 ; SSSE3: # BB#0: # %entry
124 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
125 ; SSSE3-NEXT: pxor %xmm2, %xmm2
126 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
127 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
128 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
129 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
130 ; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
133 ; SSE41-LABEL: zext_16i8_to_8i32:
134 ; SSE41: # BB#0: # %entry
135 ; SSE41-NEXT: movdqa %xmm0, %xmm1
136 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
137 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
138 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
139 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
142 ; AVX1-LABEL: zext_16i8_to_8i32:
143 ; AVX1: # BB#0: # %entry
144 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
145 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
146 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
147 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
148 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
151 ; AVX2-LABEL: zext_16i8_to_8i32:
152 ; AVX2: # BB#0: # %entry
153 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
154 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
155 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
158 %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
159 %C = zext <8 x i8> %B to <8 x i32>
; Zero-extend the low 2 bytes of a v16i8 to v2i64. SSE2 chains three unpacks
; plus a pand; SSSE3 uses a single pshufb; SSE4.1/AVX use pmovzxbq. All
; variants still emit a trailing pand mask here.
163 define <2 x i64> @zext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp {
164 ; SSE2-LABEL: zext_16i8_to_2i64:
165 ; SSE2: # BB#0: # %entry
166 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
167 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
168 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1]
169 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
172 ; SSSE3-LABEL: zext_16i8_to_2i64:
173 ; SSSE3: # BB#0: # %entry
174 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
175 ; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
178 ; SSE41-LABEL: zext_16i8_to_2i64:
179 ; SSE41: # BB#0: # %entry
180 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
181 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
184 ; AVX-LABEL: zext_16i8_to_2i64:
185 ; AVX: # BB#0: # %entry
186 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
187 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
190 %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
191 %C = zext <2 x i8> %B to <2 x i64>
; Zero-extend the low 4 bytes of a v16i8 to v4i64 (256-bit result). SSE paths
; mask both halves with a [255,255] constant kept in a register; AVX2 uses
; vpmovzxbq ymm with a broadcast mask.
195 define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
196 ; SSE2-LABEL: zext_16i8_to_4i64:
197 ; SSE2: # BB#0: # %entry
198 ; SSE2-NEXT: movdqa %xmm0, %xmm2
199 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
200 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
201 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
202 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255]
203 ; SSE2-NEXT: pand %xmm3, %xmm2
204 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
205 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
206 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
207 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,5,6,7]
208 ; SSE2-NEXT: pand %xmm3, %xmm1
209 ; SSE2-NEXT: movdqa %xmm2, %xmm0
212 ; SSSE3-LABEL: zext_16i8_to_4i64:
213 ; SSSE3: # BB#0: # %entry
214 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
215 ; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
216 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255]
217 ; SSSE3-NEXT: pand %xmm1, %xmm2
218 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,1,1,2,2,3,3,3,3,5,5,2,2,3,3]
219 ; SSSE3-NEXT: pand %xmm0, %xmm1
220 ; SSSE3-NEXT: movdqa %xmm2, %xmm0
223 ; SSE41-LABEL: zext_16i8_to_4i64:
224 ; SSE41: # BB#0: # %entry
225 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
226 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255]
227 ; SSE41-NEXT: pand %xmm1, %xmm2
228 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,1,1,2,2,3,3,3,3,5,5,2,2,3,3]
229 ; SSE41-NEXT: pand %xmm0, %xmm1
230 ; SSE41-NEXT: movdqa %xmm2, %xmm0
233 ; AVX1-LABEL: zext_16i8_to_4i64:
234 ; AVX1: # BB#0: # %entry
235 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
236 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
237 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
238 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
239 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
242 ; AVX2-LABEL: zext_16i8_to_4i64:
243 ; AVX2: # BB#0: # %entry
244 ; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
245 ; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
246 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
249 %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
250 %C = zext <4 x i8> %B to <4 x i64>
; Zero-extend the low 4 words of a v8i16 to v4i32: pxor + punpcklwd on
; SSE2/SSSE3, a single pmovzxwd on SSE4.1/AVX.
254 define <4 x i32> @zext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp {
255 ; SSE2-LABEL: zext_8i16_to_4i32:
256 ; SSE2: # BB#0: # %entry
257 ; SSE2-NEXT: pxor %xmm1, %xmm1
258 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
261 ; SSSE3-LABEL: zext_8i16_to_4i32:
262 ; SSSE3: # BB#0: # %entry
263 ; SSSE3-NEXT: pxor %xmm1, %xmm1
264 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
267 ; SSE41-LABEL: zext_8i16_to_4i32:
268 ; SSE41: # BB#0: # %entry
269 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
272 ; AVX-LABEL: zext_8i16_to_4i32:
273 ; AVX: # BB#0: # %entry
274 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
277 %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
278 %C = zext <4 x i16> %B to <4 x i32>
; Full v8i16 -> v8i32 zero extension. SSE paths produce the high half with
; punpckhwd + pand; AVX1 merges two halves via vinsertf128; AVX2 emits a
; single vpmovzxwd ymm.
282 define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
283 ; SSE2-LABEL: zext_8i16_to_8i32:
284 ; SSE2: # BB#0: # %entry
285 ; SSE2-NEXT: movdqa %xmm0, %xmm1
286 ; SSE2-NEXT: pxor %xmm2, %xmm2
287 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
288 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
289 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
292 ; SSSE3-LABEL: zext_8i16_to_8i32:
293 ; SSSE3: # BB#0: # %entry
294 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
295 ; SSSE3-NEXT: pxor %xmm2, %xmm2
296 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
297 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
298 ; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
301 ; SSE41-LABEL: zext_8i16_to_8i32:
302 ; SSE41: # BB#0: # %entry
303 ; SSE41-NEXT: movdqa %xmm0, %xmm1
304 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
305 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
306 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
309 ; AVX1-LABEL: zext_8i16_to_8i32:
310 ; AVX1: # BB#0: # %entry
311 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
312 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
313 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
314 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
317 ; AVX2-LABEL: zext_8i16_to_8i32:
318 ; AVX2: # BB#0: # %entry
319 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
322 %B = zext <8 x i16> %A to <8 x i32>
; Zero-extend the low 2 words of a v8i16 to v2i64. SSE2/SSSE3 use shuffles
; plus a pand mask; SSE4.1/AVX use pmovzxwq (still followed by a pand here).
326 define <2 x i64> @zext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp {
327 ; SSE2-LABEL: zext_8i16_to_2i64:
328 ; SSE2: # BB#0: # %entry
329 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
330 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
331 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
334 ; SSSE3-LABEL: zext_8i16_to_2i64:
335 ; SSSE3: # BB#0: # %entry
336 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
337 ; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
338 ; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
341 ; SSE41-LABEL: zext_8i16_to_2i64:
342 ; SSE41: # BB#0: # %entry
343 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
344 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
347 ; AVX-LABEL: zext_8i16_to_2i64:
348 ; AVX: # BB#0: # %entry
349 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
350 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
353 %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
354 %C = zext <2 x i16> %B to <2 x i64>
; Zero-extend the low 4 words of a v8i16 to v4i64 (256-bit result). SSE paths
; mask both halves with a [65535,65535] register constant; AVX2 uses
; vpmovzxwq ymm + broadcast mask.
358 define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp {
359 ; SSE2-LABEL: zext_8i16_to_4i64:
360 ; SSE2: # BB#0: # %entry
361 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,3]
362 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,5,5,6,7]
363 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535]
364 ; SSE2-NEXT: pand %xmm3, %xmm2
365 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
366 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
367 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,5,6,7]
368 ; SSE2-NEXT: pand %xmm3, %xmm1
369 ; SSE2-NEXT: movdqa %xmm2, %xmm0
372 ; SSSE3-LABEL: zext_8i16_to_4i64:
373 ; SSSE3: # BB#0: # %entry
374 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
375 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
376 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4,5,2,3,4,5,6,7,6,7,10,11,4,5,6,7]
377 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535]
378 ; SSSE3-NEXT: pand %xmm2, %xmm1
379 ; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
380 ; SSSE3-NEXT: pand %xmm2, %xmm0
383 ; SSE41-LABEL: zext_8i16_to_4i64:
384 ; SSE41: # BB#0: # %entry
385 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
386 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535]
387 ; SSE41-NEXT: pand %xmm1, %xmm2
388 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,6,7,10,11,4,5,6,7]
389 ; SSE41-NEXT: pand %xmm0, %xmm1
390 ; SSE41-NEXT: movdqa %xmm2, %xmm0
393 ; AVX1-LABEL: zext_8i16_to_4i64:
394 ; AVX1: # BB#0: # %entry
395 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
396 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
397 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
398 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
399 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
402 ; AVX2-LABEL: zext_8i16_to_4i64:
403 ; AVX2: # BB#0: # %entry
404 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
405 ; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
406 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
409 %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
410 %C = zext <4 x i16> %B to <4 x i64>
; Zero-extend the low 2 dwords of a v4i32 to v2i64: pshufd + pand on
; SSE2/SSSE3, pmovzxdq + pand on SSE4.1/AVX.
414 define <2 x i64> @zext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp {
415 ; SSE2-LABEL: zext_4i32_to_2i64:
416 ; SSE2: # BB#0: # %entry
417 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
418 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
421 ; SSSE3-LABEL: zext_4i32_to_2i64:
422 ; SSSE3: # BB#0: # %entry
423 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
424 ; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
427 ; SSE41-LABEL: zext_4i32_to_2i64:
428 ; SSE41: # BB#0: # %entry
429 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
430 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
433 ; AVX-LABEL: zext_4i32_to_2i64:
434 ; AVX: # BB#0: # %entry
435 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
436 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
439 %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
440 %C = zext <2 x i32> %B to <2 x i64>
; Full v4i32 -> v4i64 zero extension. SSE paths mask both halves with a
; [4294967295,4294967295] constant; AVX1 unpacks the high half against zero;
; AVX2 is a single vpmovzxdq ymm with no mask needed.
444 define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
445 ; SSE2-LABEL: zext_4i32_to_4i64:
446 ; SSE2: # BB#0: # %entry
447 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
448 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
449 ; SSE2-NEXT: pand %xmm3, %xmm2
450 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
451 ; SSE2-NEXT: pand %xmm3, %xmm1
452 ; SSE2-NEXT: movdqa %xmm2, %xmm0
455 ; SSSE3-LABEL: zext_4i32_to_4i64:
456 ; SSSE3: # BB#0: # %entry
457 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
458 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
459 ; SSSE3-NEXT: pand %xmm3, %xmm2
460 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
461 ; SSSE3-NEXT: pand %xmm3, %xmm1
462 ; SSSE3-NEXT: movdqa %xmm2, %xmm0
465 ; SSE41-LABEL: zext_4i32_to_4i64:
466 ; SSE41: # BB#0: # %entry
467 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
468 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
469 ; SSE41-NEXT: pand %xmm3, %xmm2
470 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
471 ; SSE41-NEXT: pand %xmm3, %xmm1
472 ; SSE41-NEXT: movdqa %xmm2, %xmm0
475 ; AVX1-LABEL: zext_4i32_to_4i64:
476 ; AVX1: # BB#0: # %entry
477 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
478 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
479 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
480 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
483 ; AVX2-LABEL: zext_4i32_to_4i64:
484 ; AVX2: # BB#0: # %entry
485 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
488 %B = zext <4 x i32> %A to <4 x i64>
; Zero-extend an illegal v8i8 (promoted in-register to v8i16) to v8i32. All
; paths mask with a [255,255,255,255] constant; AVX2 widens with vpmovzxwd
; ymm and masks with a broadcast constant.
492 define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
493 ; SSE2-LABEL: zext_8i8_to_8i32:
494 ; SSE2: # BB#0: # %entry
495 ; SSE2-NEXT: movdqa %xmm0, %xmm2
496 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
497 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
498 ; SSE2-NEXT: pand %xmm1, %xmm2
499 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
500 ; SSE2-NEXT: pand %xmm0, %xmm1
501 ; SSE2-NEXT: movdqa %xmm2, %xmm0
504 ; SSSE3-LABEL: zext_8i8_to_8i32:
505 ; SSSE3: # BB#0: # %entry
506 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
507 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
508 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
509 ; SSSE3-NEXT: pand %xmm1, %xmm2
510 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
511 ; SSSE3-NEXT: pand %xmm0, %xmm1
512 ; SSSE3-NEXT: movdqa %xmm2, %xmm0
515 ; SSE41-LABEL: zext_8i8_to_8i32:
516 ; SSE41: # BB#0: # %entry
517 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
518 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
519 ; SSE41-NEXT: pand %xmm1, %xmm2
520 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
521 ; SSE41-NEXT: pand %xmm0, %xmm1
522 ; SSE41-NEXT: movdqa %xmm2, %xmm0
525 ; AVX1-LABEL: zext_8i8_to_8i32:
526 ; AVX1: # BB#0: # %entry
527 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
528 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
529 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
530 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
533 ; AVX2-LABEL: zext_8i8_to_8i32:
534 ; AVX2: # BB#0: # %entry
535 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
536 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
537 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
540 %t = zext <8 x i8> %z to <8 x i32>
; Load + zext v16i8 -> v16i16. With SSE4.1/AVX1 the load folds into two
; pmovzxbw from memory (no separate movdqa); AVX2 folds it into one
; vpmovzxbw ymm from memory.
544 define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) {
545 ; SSE2-LABEL: load_zext_16i8_to_16i16:
546 ; SSE2: # BB#0: # %entry
547 ; SSE2-NEXT: movdqa (%rdi), %xmm1
548 ; SSE2-NEXT: pxor %xmm2, %xmm2
549 ; SSE2-NEXT: movdqa %xmm1, %xmm0
550 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
551 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
552 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
555 ; SSSE3-LABEL: load_zext_16i8_to_16i16:
556 ; SSSE3: # BB#0: # %entry
557 ; SSSE3-NEXT: movdqa (%rdi), %xmm1
558 ; SSSE3-NEXT: pxor %xmm2, %xmm2
559 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
560 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
561 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
562 ; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
565 ; SSE41-LABEL: load_zext_16i8_to_16i16:
566 ; SSE41: # BB#0: # %entry
567 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
568 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
571 ; AVX1-LABEL: load_zext_16i8_to_16i16:
572 ; AVX1: # BB#0: # %entry
573 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
574 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
575 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
578 ; AVX2-LABEL: load_zext_16i8_to_16i16:
579 ; AVX2: # BB#0: # %entry
580 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
583 %X = load <16 x i8>, <16 x i8>* %ptr
584 %Y = zext <16 x i8> %X to <16 x i16>
; Load + zext v8i16 -> v8i32. SSE4.1/AVX1 fold the load into two pmovzxwd
; from memory; AVX2 folds it into one vpmovzxwd ymm from memory.
588 define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) {
589 ; SSE2-LABEL: load_zext_8i16_to_8i32:
590 ; SSE2: # BB#0: # %entry
591 ; SSE2-NEXT: movdqa (%rdi), %xmm1
592 ; SSE2-NEXT: pxor %xmm2, %xmm2
593 ; SSE2-NEXT: movdqa %xmm1, %xmm0
594 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
595 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
596 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
599 ; SSSE3-LABEL: load_zext_8i16_to_8i32:
600 ; SSSE3: # BB#0: # %entry
601 ; SSSE3-NEXT: movdqa (%rdi), %xmm1
602 ; SSSE3-NEXT: pxor %xmm2, %xmm2
603 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
604 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
605 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
606 ; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
609 ; SSE41-LABEL: load_zext_8i16_to_8i32:
610 ; SSE41: # BB#0: # %entry
611 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
612 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
615 ; AVX1-LABEL: load_zext_8i16_to_8i32:
616 ; AVX1: # BB#0: # %entry
617 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
618 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
619 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
622 ; AVX2-LABEL: load_zext_8i16_to_8i32:
623 ; AVX2: # BB#0: # %entry
624 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
627 %X = load <8 x i16>, <8 x i16>* %ptr
628 %Y = zext <8 x i16> %X to <8 x i32>
; Load + zext v4i32 -> v4i64. SSE4.1/AVX1 fold the load into two pmovzxdq
; from memory; AVX2 folds it into one vpmovzxdq ymm from memory.
632 define <4 x i64> @load_zext_4i32_to_4i64(<4 x i32> *%ptr) {
633 ; SSE2-LABEL: load_zext_4i32_to_4i64:
634 ; SSE2: # BB#0: # %entry
635 ; SSE2-NEXT: movdqa (%rdi), %xmm1
636 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
637 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
638 ; SSE2-NEXT: pand %xmm2, %xmm0
639 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
640 ; SSE2-NEXT: pand %xmm2, %xmm1
643 ; SSSE3-LABEL: load_zext_4i32_to_4i64:
644 ; SSSE3: # BB#0: # %entry
645 ; SSSE3-NEXT: movdqa (%rdi), %xmm1
646 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
647 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
648 ; SSSE3-NEXT: pand %xmm2, %xmm0
649 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
650 ; SSSE3-NEXT: pand %xmm2, %xmm1
653 ; SSE41-LABEL: load_zext_4i32_to_4i64:
654 ; SSE41: # BB#0: # %entry
655 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
656 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
659 ; AVX1-LABEL: load_zext_4i32_to_4i64:
660 ; AVX1: # BB#0: # %entry
661 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
662 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
663 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
666 ; AVX2-LABEL: load_zext_4i32_to_4i64:
667 ; AVX2: # BB#0: # %entry
668 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
671 %X = load <4 x i32>, <4 x i32>* %ptr
672 %Y = zext <4 x i32> %X to <4 x i64>
; A shufflevector interleaving %A with zeroinitializer, bitcast to v8i32,
; should be recognized as a zero extension (AVX2 emits a single vpmovzxwd
; ymm; SSE/AVX1 use unpacks against a zeroed register).
676 define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
677 ; SSE2-LABEL: shuf_zext_8i16_to_8i32:
678 ; SSE2: # BB#0: # %entry
679 ; SSE2-NEXT: movdqa %xmm0, %xmm1
680 ; SSE2-NEXT: pxor %xmm2, %xmm2
681 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
682 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
685 ; SSSE3-LABEL: shuf_zext_8i16_to_8i32:
686 ; SSSE3: # BB#0: # %entry
687 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
688 ; SSSE3-NEXT: pxor %xmm2, %xmm2
689 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
690 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
693 ; SSE41-LABEL: shuf_zext_8i16_to_8i32:
694 ; SSE41: # BB#0: # %entry
695 ; SSE41-NEXT: movdqa %xmm0, %xmm1
696 ; SSE41-NEXT: pxor %xmm2, %xmm2
697 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
698 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
701 ; AVX1-LABEL: shuf_zext_8i16_to_8i32:
702 ; AVX1: # BB#0: # %entry
703 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
704 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
705 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
706 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
709 ; AVX2-LABEL: shuf_zext_8i16_to_8i32:
710 ; AVX2: # BB#0: # %entry
711 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
714 %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8, i32 5, i32 8, i32 6, i32 8, i32 7, i32 8>
715 %Z = bitcast <16 x i16> %B to <8 x i32>
; Interleave-with-zero shuffle bitcast to v4i64, recognized as zext: AVX2
; emits one vpmovzxdq ymm; AVX1 currently goes through an insertps/blendpd/
; permilps sequence; SSE uses punpckldq/punpckhdq against zero.
719 define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
720 ; SSE2-LABEL: shuf_zext_4i32_to_4i64:
721 ; SSE2: # BB#0: # %entry
722 ; SSE2-NEXT: movdqa %xmm0, %xmm1
723 ; SSE2-NEXT: pxor %xmm2, %xmm2
724 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
725 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
728 ; SSSE3-LABEL: shuf_zext_4i32_to_4i64:
729 ; SSSE3: # BB#0: # %entry
730 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
731 ; SSSE3-NEXT: pxor %xmm2, %xmm2
732 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
733 ; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
736 ; SSE41-LABEL: shuf_zext_4i32_to_4i64:
737 ; SSE41: # BB#0: # %entry
738 ; SSE41-NEXT: movdqa %xmm0, %xmm1
739 ; SSE41-NEXT: pxor %xmm2, %xmm2
740 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
741 ; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
744 ; AVX1-LABEL: shuf_zext_4i32_to_4i64:
745 ; AVX1: # BB#0: # %entry
746 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
747 ; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2
748 ; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
749 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,0]
750 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
753 ; AVX2-LABEL: shuf_zext_4i32_to_4i64:
754 ; AVX2: # BB#0: # %entry
755 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
758 %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 2, i32 4, i32 3, i32 4>
759 %Z = bitcast <8 x i32> %B to <4 x i64>
; v8i8 interleaved with zero bytes, bitcast to v8i32. The illegal v8i8 input
; is first packed to contiguous bytes (pand+packuswb on SSE2, pshufb
; elsewhere), then zero-extended; AVX2 finishes with one vpmovzxbd ymm.
763 define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
764 ; SSE2-LABEL: shuf_zext_8i8_to_8i32:
765 ; SSE2: # BB#0: # %entry
766 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
767 ; SSE2-NEXT: packuswb %xmm0, %xmm0
768 ; SSE2-NEXT: pxor %xmm1, %xmm1
769 ; SSE2-NEXT: movdqa %xmm0, %xmm2
770 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
771 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
772 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
773 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
774 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
775 ; SSE2-NEXT: pandn %xmm0, %xmm1
776 ; SSE2-NEXT: movdqa %xmm2, %xmm0
779 ; SSSE3-LABEL: shuf_zext_8i8_to_8i32:
780 ; SSSE3: # BB#0: # %entry
781 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
782 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
783 ; SSSE3-NEXT: pxor %xmm2, %xmm2
784 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
785 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
786 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
787 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
790 ; SSE41-LABEL: shuf_zext_8i8_to_8i32:
791 ; SSE41: # BB#0: # %entry
792 ; SSE41-NEXT: movdqa %xmm0, %xmm1
793 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
794 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
795 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
798 ; AVX1-LABEL: shuf_zext_8i8_to_8i32:
799 ; AVX1: # BB#0: # %entry
800 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
801 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
802 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
803 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
806 ; AVX2-LABEL: shuf_zext_8i8_to_8i32:
807 ; AVX2: # BB#0: # %entry
808 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
809 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
812 %B = shufflevector <8 x i8> %A, <8 x i8> zeroinitializer, <32 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8, i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8, i32 6, i32 8, i32 8, i32 8, i32 7, i32 8, i32 8, i32 8>
813 %Z = bitcast <32 x i8> %B to <8 x i32>