; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
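
; cttz is lowered as a popcount of (x & -x) - 1: negating x isolates its lowest
; set bit, subtracting one turns the trailing zeros into ones, and the ones are
; counted with a vpshufb nibble lookup. AVX1 lacks 256-bit integer ops, so it
; works on 128-bit halves; AVX2 keeps the full ymm width.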
define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0)
  ret <4 x i64> %out
}
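
; With is_zero_undef=-1 the lowering is unchanged: (x & -x) - 1 is already
; all-ones for x = 0, so the popcount yields the bit width for free.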
define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i64u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1)
  ret <4 x i64> %out
}
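
; v8i32: vpsadbw only produces i64 sums, so the byte counts are unpacked
; against zero, summed, and repacked with vpackuswb.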
define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm5, %xmm5
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 0)
  ret <8 x i32> %out
}
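
; Identical lowering to testv8i32.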
define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm5, %xmm5
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i32u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 -1)
  ret <8 x i32> %out
}
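
; v16i16: the per-word counts are formed from the byte counts with
; vpsllw $8 + vpaddb + vpsrlw $8 rather than vpsadbw.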
define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubw %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm4
; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 0)
  ret <16 x i16> %out
}
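
; Identical lowering to testv16i16.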
define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubw %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm4
; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv16i16u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 -1)
  ret <16 x i16> %out
}
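
; v32i8: the nibble-LUT popcount is already a per-byte result, so no widening
; sum is needed.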
define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 0)
  ret <32 x i8> %out
}
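
; Identical lowering to testv32i8.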
define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv32i8u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 -1)
  ret <32 x i8> %out
}
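
; The remaining tests have constant operands, so cttz folds to a constant-pool
; load: cttz(256) = 8, cttz(-1) = 0, cttz(0) = 64 (the bit width), cttz(255) = 0.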
define <4 x i64> @foldv4i64() nounwind {
; ALL-LABEL: foldv4i64:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; ALL-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
  ret <4 x i64> %out
}
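
; With is_zero_undef=-1 the zero element's count is undefined, but the folder
; still emits 64 for it, so the result matches foldv4i64.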
define <4 x i64> @foldv4i64u() nounwind {
; ALL-LABEL: foldv4i64u:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; ALL-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
  ret <4 x i64> %out
}
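
; cttz(-65536) = 16 and cttz(24) = cttz(88) = 3.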
define <8 x i32> @foldv8i32() nounwind {
; ALL-LABEL: foldv8i32:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; ALL-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0)
  ret <8 x i32> %out
}
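
; Same folded result with the zero-undef hint.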
define <8 x i32> @foldv8i32u() nounwind {
; ALL-LABEL: foldv8i32u:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; ALL-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1)
  ret <8 x i32> %out
}
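
; i16 -65536 wraps to 0 when the IR is parsed, so its count is the bit
; width (16).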
define <16 x i16> @foldv16i16() nounwind {
; ALL-LABEL: foldv16i16:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; ALL-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0)
  ret <16 x i16> %out
}
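
; Same folded result with the zero-undef hint.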
define <16 x i16> @foldv16i16u() nounwind {
; ALL-LABEL: foldv16i16u:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; ALL-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1)
  ret <16 x i16> %out
}
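
; Out-of-range elements such as i8 256 and i8 -65536 wrap to 0 when parsed,
; folding to counts of 8.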
define <32 x i8> @foldv32i8() nounwind {
; ALL-LABEL: foldv32i8:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; ALL-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0)
  ret <32 x i8> %out
}
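
; Same folded result with the zero-undef hint.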
define <32 x i8> @foldv32i8u() nounwind {
; ALL-LABEL: foldv32i8u:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; ALL-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1)
  ret <32 x i8> %out
}

declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1)
declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1)
declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1)
declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>, i1)