; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

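; (x s< 0) ? x ^ 0x8000 : 0 is an unsigned saturating subtraction of the splat
; constant 32768 and should be matched to a single psubusw.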
define void @test1(i16* nocapture %head) nounwind {
; SSE-LABEL: test1:
; SSE: ## BB#0: ## %vector.ph
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test1:
; AVX: ## BB#0: ## %vector.ph
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rdi)
; AVX-NEXT: retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <8 x i16>*
  %2 = load <8 x i16>, <8 x i16>* %1, align 2
  %3 = icmp slt <8 x i16> %2, zeroinitializer
  %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  store <8 x i16> %5, <8 x i16>* %1, align 2
  ret void
}

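; (x u> 32766) ? x - 32767 : 0 (the subtraction written as an add of -32767) is
; an unsigned saturating subtraction of the splat constant 32767 and should also
; become a single psubusw.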
define void @test2(i16* nocapture %head) nounwind {
; SSE-LABEL: test2:
; SSE: ## BB#0: ## %vector.ph
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test2:
; AVX: ## BB#0: ## %vector.ph
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rdi)
; AVX-NEXT: retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <8 x i16>*
  %2 = load <8 x i16>, <8 x i16>* %1, align 2
  %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  store <8 x i16> %5, <8 x i16>* %1, align 2
  ret void
}

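; (x u< w) ? 0 : x - w is an unsigned saturating subtraction of a variable splat
; and should become psubusw against the broadcast of %w.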
define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
; SSE2-LABEL: test3:
; SSE2: ## BB#0: ## %vector.ph
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: movdqu (%rdi), %xmm1
; SSE2-NEXT: psubusw %xmm0, %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test3:
; SSSE3: ## BB#0: ## %vector.ph
; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSSE3-NEXT: movdqu (%rdi), %xmm1
; SSSE3-NEXT: psubusw %xmm0, %xmm1
; SSSE3-NEXT: movdqu %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test3:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vmovdqu (%rdi), %xmm1
; AVX1-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: test3:
; AVX2: ## BB#0: ## %vector.ph
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: vmovdqu (%rdi), %xmm1
; AVX2-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
vector.ph:
  %0 = insertelement <8 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %1 = getelementptr inbounds i16, i16* %head, i64 0
  %2 = bitcast i16* %1 to <8 x i16>*
  %3 = load <8 x i16>, <8 x i16>* %2, align 2
  %4 = icmp ult <8 x i16> %3, %broadcast15
  %5 = sub <8 x i16> %3, %broadcast15
  %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5
  store <8 x i16> %6, <8 x i16>* %2, align 2
  ret void
}

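; v16i8 variant of test1: saturating subtraction of the splat constant 128,
; expected to lower to psubusb.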
define void @test4(i8* nocapture %head) nounwind {
; SSE-LABEL: test4:
; SSE: ## BB#0: ## %vector.ph
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test4:
; AVX: ## BB#0: ## %vector.ph
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rdi)
; AVX-NEXT: retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <16 x i8>*
  %2 = load <16 x i8>, <16 x i8>* %1, align 1
  %3 = icmp slt <16 x i8> %2, zeroinitializer
  %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  store <16 x i8> %5, <16 x i8>* %1, align 1
  ret void
}

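; v16i8 variant of test2: (x u> 126) ? x - 127 : 0, expected to lower to
; psubusb of the splat constant 127.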
define void @test5(i8* nocapture %head) nounwind {
; SSE-LABEL: test5:
; SSE: ## BB#0: ## %vector.ph
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test5:
; AVX: ## BB#0: ## %vector.ph
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rdi)
; AVX-NEXT: retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <16 x i8>*
  %2 = load <16 x i8>, <16 x i8>* %1, align 1
  %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  store <16 x i8> %5, <16 x i8>* %1, align 1
  ret void
}

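; v16i8 variant of test3: saturating subtraction of a variable splat byte,
; expected to lower to psubusb.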
define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
; SSE2-LABEL: test6:
; SSE2: ## BB#0: ## %vector.ph
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: movdqu (%rdi), %xmm1
; SSE2-NEXT: psubusb %xmm0, %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test6:
; SSSE3: ## BB#0: ## %vector.ph
; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: movdqu (%rdi), %xmm1
; SSSE3-NEXT: psubusb %xmm0, %xmm1
; SSSE3-NEXT: movdqu %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test6:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu (%rdi), %xmm1
; AVX1-NEXT: vpsubusb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: test6:
; AVX2: ## BB#0: ## %vector.ph
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vmovdqu (%rdi), %xmm1
; AVX2-NEXT: vpsubusb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
vector.ph:
  %0 = insertelement <16 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %1 = getelementptr inbounds i8, i8* %head, i64 0
  %2 = bitcast i8* %1 to <16 x i8>*
  %3 = load <16 x i8>, <16 x i8>* %2, align 1
  %4 = icmp ult <16 x i8> %3, %broadcast15
  %5 = sub <16 x i8> %3, %broadcast15
  %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5
  store <16 x i8> %6, <16 x i8>* %2, align 1
  ret void
}

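; v16i16 variant of test1. SSE splits the 256-bit vector into two psubusw;
; AVX2 uses a single 256-bit vpsubusw; AVX1 does not form the saturating
; subtract and expands via compare/xor/and instead.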
define void @test7(i16* nocapture %head) nounwind {
; SSE-LABEL: test7:
; SSE: ## BB#0: ## %vector.ph
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
; SSE-NEXT: movdqu %xmm1, 16(%rdi)
; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test7:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test7:
; AVX2: ## BB#0: ## %vector.ph
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>, <16 x i16>* %1, align 2
  %3 = icmp slt <16 x i16> %2, zeroinitializer
  %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  store <16 x i16> %5, <16 x i16>* %1, align 2
  ret void
}

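; v16i16 variant of test2 (saturating subtraction of the splat constant 32767).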
define void @test8(i16* nocapture %head) nounwind {
; SSE-LABEL: test8:
; SSE: ## BB#0: ## %vector.ph
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
; SSE-NEXT: movdqu %xmm1, 16(%rdi)
; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test8:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534]
; AVX1-NEXT: vpcmpgtw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32769,32769,32769,32769,32769,32769,32769,32769]
; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test8:
; AVX2: ## BB#0: ## %vector.ph
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>, <16 x i16>* %1, align 2
  %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  store <16 x i16> %5, <16 x i16>* %1, align 2
  ret void
}

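; v16i16 variant of test3 (saturating subtraction of a variable splat word).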
define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
; SSE2-LABEL: test9:
; SSE2: ## BB#0: ## %vector.ph
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: movdqu (%rdi), %xmm1
; SSE2-NEXT: movdqu 16(%rdi), %xmm2
; SSE2-NEXT: psubusw %xmm0, %xmm1
; SSE2-NEXT: psubusw %xmm0, %xmm2
; SSE2-NEXT: movdqu %xmm2, 16(%rdi)
; SSE2-NEXT: movdqu %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test9:
; SSSE3: ## BB#0: ## %vector.ph
; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSSE3-NEXT: movdqu (%rdi), %xmm1
; SSSE3-NEXT: movdqu 16(%rdi), %xmm2
; SSSE3-NEXT: psubusw %xmm0, %xmm1
; SSSE3-NEXT: psubusw %xmm0, %xmm2
; SSSE3-NEXT: movdqu %xmm2, 16(%rdi)
; SSSE3-NEXT: movdqu %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test9:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vmovd %esi, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: vpmaxuw %xmm1, %xmm2, %xmm4
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test9:
; AVX2: ## BB#0: ## %vector.ph
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: vmovdqu (%rdi), %ymm1
; AVX2-NEXT: vpsubusw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
  %0 = insertelement <16 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
  %1 = getelementptr inbounds i16, i16* %head, i64 0
  %2 = bitcast i16* %1 to <16 x i16>*
  %3 = load <16 x i16>, <16 x i16>* %2, align 2
  %4 = icmp ult <16 x i16> %3, %broadcast15
  %5 = sub <16 x i16> %3, %broadcast15
  %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5
  store <16 x i16> %6, <16 x i16>* %2, align 2
  ret void
}

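; v32i8 variant of test4 (saturating subtraction of the splat constant 128).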
define void @test10(i8* nocapture %head) nounwind {
; SSE-LABEL: test10:
; SSE: ## BB#0: ## %vector.ph
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE-NEXT: psubusb %xmm2, %xmm0
; SSE-NEXT: psubusb %xmm2, %xmm1
; SSE-NEXT: movdqu %xmm1, 16(%rdi)
; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test10:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test10:
; AVX2: ## BB#0: ## %vector.ph
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>, <32 x i8>* %1, align 1
  %3 = icmp slt <32 x i8> %2, zeroinitializer
  %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  store <32 x i8> %5, <32 x i8>* %1, align 1
  ret void
}

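; v32i8 variant of test5 (saturating subtraction of the splat constant 127).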
define void @test11(i8* nocapture %head) nounwind {
; SSE-LABEL: test11:
; SSE: ## BB#0: ## %vector.ph
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE-NEXT: psubusb %xmm2, %xmm0
; SSE-NEXT: psubusb %xmm2, %xmm1
; SSE-NEXT: movdqu %xmm1, 16(%rdi)
; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test11:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
; AVX1-NEXT: vpcmpgtb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11:
; AVX2: ## BB#0: ## %vector.ph
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>, <32 x i8>* %1, align 1
  %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  store <32 x i8> %5, <32 x i8>* %1, align 1
  ret void
}

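; v32i8 variant of test6 (saturating subtraction of a variable splat byte).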
define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
; SSE2-LABEL: test12:
; SSE2: ## BB#0: ## %vector.ph
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: movdqu (%rdi), %xmm1
; SSE2-NEXT: movdqu 16(%rdi), %xmm2
; SSE2-NEXT: psubusb %xmm0, %xmm1
; SSE2-NEXT: psubusb %xmm0, %xmm2
; SSE2-NEXT: movdqu %xmm2, 16(%rdi)
; SSE2-NEXT: movdqu %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test12:
; SSSE3: ## BB#0: ## %vector.ph
; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: movdqu (%rdi), %xmm1
; SSSE3-NEXT: movdqu 16(%rdi), %xmm2
; SSSE3-NEXT: psubusb %xmm0, %xmm1
; SSSE3-NEXT: psubusb %xmm0, %xmm2
; SSSE3-NEXT: movdqu %xmm2, 16(%rdi)
; SSSE3-NEXT: movdqu %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test12:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vmovd %esi, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: vpmaxub %xmm1, %xmm2, %xmm4
; AVX1-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test12:
; AVX2: ## BB#0: ## %vector.ph
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqu (%rdi), %ymm1
; AVX2-NEXT: vpsubusb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
  %0 = insertelement <32 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
  %1 = getelementptr inbounds i8, i8* %head, i64 0
  %2 = bitcast i8* %1 to <32 x i8>*
  %3 = load <32 x i8>, <32 x i8>* %2, align 1
  %4 = icmp ult <32 x i8> %3, %broadcast15
  %5 = sub <32 x i8> %3, %broadcast15
  %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5
  store <32 x i8> %6, <32 x i8>* %2, align 1
  ret void
}