1 ; RUN: llc -mcpu=core2 < %s | FileCheck %s -check-prefix=SSSE3
2 ; RUN: llc -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1
3 ; RUN: llc -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
; Module-wide target settings shared by every function in this file: a
; little-endian layout with 64-bit pointers and an x86-64 macOS 10.8 triple.
; The llc invocations at the top of the file vary only the -mcpu value.
5 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
6 target triple = "x86_64-apple-macosx10.8.0"
; test1 - 128-bit, eight i16 lanes, constant subtrahend in sign-bit form.
; Lanes that are negative when read as signed (unsigned value >= 32768) get
; their top bit flipped, which equals subtracting 32768; every other lane
; becomes 0. Per lane that is an unsigned saturating subtract of 32768.
; The expectations require one psubusw (vpsubusw on the AVX targets) with a
; constant-pool operand between the unaligned load and the unaligned store.
8 define void @test1(i16* nocapture %head) nounwind {
10 %0 = getelementptr inbounds i16, i16* %head, i64 0
11 %1 = bitcast i16* %0 to <8 x i16>*
; Only 2-byte alignment is promised, hence the unaligned moves expected below.
12 %2 = load <8 x i16>, <8 x i16>* %1, align 2
; mask: lane < 0 (signed compare against zero)
13 %3 = icmp slt <8 x i16> %2, zeroinitializer
; -32768 is 0x8000, so the xor toggles only the top bit of each lane
14 %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
15 %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
; The result overwrites the input vector in place.
16 store <8 x i16> %5, <8 x i16>* %1, align 2
; Expected core2 output (legacy SSE encoding, two operands)
21 ; SSSE3-NEXT: movdqu (%rdi), %xmm0
22 ; SSSE3-NEXT: psubusw LCPI0_0(%rip), %xmm0
23 ; SSSE3-NEXT: movdqu %xmm0, (%rdi)
; Expected corei7-avx output (VEX encoding, three operands)
28 ; AVX1-NEXT: vmovdqu (%rdi), %xmm0
29 ; AVX1-NEXT: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
30 ; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; Expected core-avx2 output
35 ; AVX2-NEXT: vmovdqu (%rdi), %xmm0
36 ; AVX2-NEXT: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
37 ; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; test2 - 128-bit, eight i16 lanes, constant subtrahend in compare/add form.
; Lanes strictly greater than 32766 (unsigned) have -32767 added, i.e. 32767
; subtracted; every other lane becomes 0. Per lane that is an unsigned
; saturating subtract of 32767, and the expectations require one
; psubusw/vpsubusw with a constant-pool operand.
41 define void @test2(i16* nocapture %head) nounwind {
43 %0 = getelementptr inbounds i16, i16* %head, i64 0
44 %1 = bitcast i16* %0 to <8 x i16>*
45 %2 = load <8 x i16>, <8 x i16>* %1, align 2
; mask: lane > 32766 (unsigned), i.e. lane >= 32767
46 %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
; adding -32767 is the wrapping form of subtracting 32767
47 %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
48 %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
; The result overwrites the input vector in place.
49 store <8 x i16> %5, <8 x i16>* %1, align 2
; Expected core2 output (legacy SSE encoding, two operands)
54 ; SSSE3-NEXT: movdqu (%rdi), %xmm0
55 ; SSSE3-NEXT: psubusw LCPI1_0(%rip), %xmm0
56 ; SSSE3-NEXT: movdqu %xmm0, (%rdi)
; Expected corei7-avx output (VEX encoding, three operands)
61 ; AVX1-NEXT: vmovdqu (%rdi), %xmm0
62 ; AVX1-NEXT: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
63 ; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; Expected core-avx2 output
68 ; AVX2-NEXT: vmovdqu (%rdi), %xmm0
69 ; AVX2-NEXT: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
70 ; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; test3 - 128-bit, eight i16 lanes, subtrahend supplied at run time in %w.
; %w is inserted into lane 0 and splatted to all lanes with an all-zero
; shuffle mask. Lanes smaller than the splat (unsigned) become 0, all others
; become lane - %w: an unsigned saturating subtract of %w.
; The expectations require the splat to be built in a register (pshufb on
; core2 and corei7-avx, vpbroadcastw on core-avx2) followed by a
; register-register psubusw/vpsubusw.
74 define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
76 %0 = insertelement <8 x i16> undef, i16 %w, i32 0
; zeroinitializer mask: every result lane reads lane 0 of %0
77 %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
78 %1 = getelementptr inbounds i16, i16* %head, i64 0
79 %2 = bitcast i16* %1 to <8 x i16>*
80 %3 = load <8 x i16>, <8 x i16>* %2, align 2
; mask: lane < %w (unsigned), the lanes where the subtraction would wrap
81 %4 = icmp ult <8 x i16> %3, %broadcast15
82 %5 = sub <8 x i16> %3, %broadcast15
; Select operands are in the opposite order from test1/test2: zero on true.
83 %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5
; The result overwrites the input vector in place.
84 store <8 x i16> %6, <8 x i16>* %2, align 2
; Expected core2 output: the byte pattern 0,1 repeated splats the low word
89 ; SSSE3-NEXT: movd %esi, %xmm0
90 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
91 ; SSSE3-NEXT: movdqu (%rdi), %xmm1
92 ; SSSE3-NEXT: psubusw %xmm0, %xmm1
93 ; SSSE3-NEXT: movdqu %xmm1, (%rdi)
; Expected corei7-avx output: same splat, VEX three-operand subtract
98 ; AVX1-NEXT: vmovd %esi, %xmm0
99 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
100 ; AVX1-NEXT: vmovdqu (%rdi), %xmm1
101 ; AVX1-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
102 ; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; Expected core-avx2 output: the splat becomes a single vpbroadcastw
107 ; AVX2-NEXT: vmovd %esi, %xmm0
108 ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
109 ; AVX2-NEXT: vmovdqu (%rdi), %xmm1
110 ; AVX2-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
111 ; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; test4 - 128-bit, sixteen i8 lanes, constant subtrahend in sign-bit form.
; Byte-sized counterpart of test1: lanes that are negative when read as
; signed (unsigned value >= 128) get their top bit flipped, which equals
; subtracting 128; every other lane becomes 0. The expectations require one
; psubusb/vpsubusb with a constant-pool operand.
115 define void @test4(i8* nocapture %head) nounwind {
117 %0 = getelementptr inbounds i8, i8* %head, i64 0
118 %1 = bitcast i8* %0 to <16 x i8>*
; Only byte alignment is promised, hence the unaligned moves expected below.
119 %2 = load <16 x i8>, <16 x i8>* %1, align 1
; mask: lane < 0 (signed compare against zero)
120 %3 = icmp slt <16 x i8> %2, zeroinitializer
; -128 is 0x80, so the xor toggles only the top bit of each lane
121 %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
122 %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
; The result overwrites the input vector in place.
123 store <16 x i8> %5, <16 x i8>* %1, align 1
; Expected core2 output (legacy SSE encoding, two operands)
128 ; SSSE3-NEXT: movdqu (%rdi), %xmm0
129 ; SSSE3-NEXT: psubusb LCPI3_0(%rip), %xmm0
130 ; SSSE3-NEXT: movdqu %xmm0, (%rdi)
; Expected corei7-avx output (VEX encoding, three operands)
135 ; AVX1-NEXT: vmovdqu (%rdi), %xmm0
136 ; AVX1-NEXT: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
137 ; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; Expected core-avx2 output
142 ; AVX2-NEXT: vmovdqu (%rdi), %xmm0
143 ; AVX2-NEXT: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
144 ; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; test5 - 128-bit, sixteen i8 lanes, constant subtrahend in compare/add form.
; Byte-sized counterpart of test2: lanes strictly greater than 126 (unsigned)
; have -127 added, i.e. 127 subtracted; every other lane becomes 0. Per lane
; that is an unsigned saturating subtract of 127, and the expectations require
; one psubusb/vpsubusb with a constant-pool operand.
148 define void @test5(i8* nocapture %head) nounwind {
150 %0 = getelementptr inbounds i8, i8* %head, i64 0
151 %1 = bitcast i8* %0 to <16 x i8>*
152 %2 = load <16 x i8>, <16 x i8>* %1, align 1
; mask: lane > 126 (unsigned), i.e. lane >= 127
153 %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
; adding -127 is the wrapping form of subtracting 127
154 %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
155 %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
; The result overwrites the input vector in place.
156 store <16 x i8> %5, <16 x i8>* %1, align 1
; Expected core2 output (legacy SSE encoding, two operands)
161 ; SSSE3-NEXT: movdqu (%rdi), %xmm0
162 ; SSSE3-NEXT: psubusb LCPI4_0(%rip), %xmm0
163 ; SSSE3-NEXT: movdqu %xmm0, (%rdi)
; Expected corei7-avx output. The VEX form prints three operands; the
; destination is spelled out so the check pins the register that the
; following store reads, instead of matching only a prefix of the line.
168 ; AVX1-NEXT: vmovdqu (%rdi), %xmm0
169 ; AVX1-NEXT: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0
170 ; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; Expected core-avx2 output (same three-operand spelling)
175 ; AVX2-NEXT: vmovdqu (%rdi), %xmm0
176 ; AVX2-NEXT: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0
177 ; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; test6 - 128-bit, sixteen i8 lanes, subtrahend supplied at run time in %w.
; Byte-sized counterpart of test3: %w is inserted into lane 0 and splatted to
; all lanes with an all-zero shuffle mask. Lanes smaller than the splat
; (unsigned) become 0, all others become lane - %w: an unsigned saturating
; subtract of %w.
; The expectations require the splat to be built in a register (pshufb with a
; zeroed control vector on core2 and corei7-avx, vpbroadcastb on core-avx2)
; followed by a register-register psubusb/vpsubusb.
181 define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
183 %0 = insertelement <16 x i8> undef, i8 %w, i32 0
; zeroinitializer mask: every result lane reads lane 0 of %0
184 %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
185 %1 = getelementptr inbounds i8, i8* %head, i64 0
186 %2 = bitcast i8* %1 to <16 x i8>*
187 %3 = load <16 x i8>, <16 x i8>* %2, align 1
; mask: lane < %w (unsigned), the lanes where the subtraction would wrap
188 %4 = icmp ult <16 x i8> %3, %broadcast15
189 %5 = sub <16 x i8> %3, %broadcast15
; Select operands are in the opposite order from test4/test5: zero on true.
190 %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5
; The result overwrites the input vector in place.
191 store <16 x i8> %6, <16 x i8>* %2, align 1
; Expected core2 output: an all-zero pshufb control selects byte 0 everywhere
196 ; SSSE3-NEXT: movd %esi, %xmm0
197 ; SSSE3-NEXT: pxor %xmm1, %xmm1
198 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
199 ; SSSE3-NEXT: movdqu (%rdi), %xmm1
200 ; SSSE3-NEXT: psubusb %xmm0, %xmm1
201 ; SSSE3-NEXT: movdqu %xmm1, (%rdi)
; Expected corei7-avx output. The VEX forms print three operands; the
; destinations are spelled out so the checks pin that the zeroed control
; lands in xmm1 and the splat in xmm0, which the vpsubusb line then reads.
206 ; AVX1-NEXT: vmovd %esi, %xmm0
207 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
208 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
209 ; AVX1-NEXT: vmovdqu (%rdi), %xmm1
210 ; AVX1-NEXT: vpsubusb %xmm0, %xmm1, %xmm0
211 ; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; Expected core-avx2 output: the splat becomes a single vpbroadcastb
216 ; AVX2-NEXT: vmovd %esi, %xmm0
217 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
218 ; AVX2-NEXT: vmovdqu (%rdi), %xmm1
219 ; AVX2-NEXT: vpsubusb %xmm0, %xmm1, %xmm0
220 ; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; test7 - 256-bit, sixteen i16 lanes, constant subtrahend in sign-bit form.
; Same pattern as test1 on a vector twice as wide: negative lanes get their
; top bit flipped (subtracting 32768), all other lanes become 0.
; Only core-avx2 expectations are visible for this function: one ymm-wide
; vpsubusw with a constant-pool operand, then vzeroupper.
224 define void @test7(i16* nocapture %head) nounwind {
226 %0 = getelementptr inbounds i16, i16* %head, i64 0
227 %1 = bitcast i16* %0 to <16 x i16>*
228 %2 = load <16 x i16>, <16 x i16>* %1, align 2
; mask: lane < 0 (signed compare against zero)
229 %3 = icmp slt <16 x i16> %2, zeroinitializer
; -32768 is 0x8000, so the xor toggles only the top bit of each lane
230 %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
231 %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
; The result overwrites the input vector in place.
232 store <16 x i16> %5, <16 x i16>* %1, align 2
; Expected core-avx2 output
237 ; AVX2-NEXT: vmovdqu (%rdi), %ymm0
238 ; AVX2-NEXT: vpsubusw LCPI6_0(%rip), %ymm0, %ymm0
239 ; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
240 ; AVX2-NEXT: vzeroupper
; test8 - 256-bit, sixteen i16 lanes, constant subtrahend in compare/add form.
; Same pattern as test2 on a vector twice as wide: lanes strictly greater
; than 32766 (unsigned) have 32767 subtracted, all other lanes become 0.
; Only core-avx2 expectations are visible for this function: one ymm-wide
; vpsubusw with a constant-pool operand, then vzeroupper.
244 define void @test8(i16* nocapture %head) nounwind {
246 %0 = getelementptr inbounds i16, i16* %head, i64 0
247 %1 = bitcast i16* %0 to <16 x i16>*
248 %2 = load <16 x i16>, <16 x i16>* %1, align 2
; mask: lane > 32766 (unsigned), i.e. lane >= 32767
249 %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
; adding -32767 is the wrapping form of subtracting 32767
250 %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
251 %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
; The result overwrites the input vector in place.
252 store <16 x i16> %5, <16 x i16>* %1, align 2
; Expected core-avx2 output
257 ; AVX2-NEXT: vmovdqu (%rdi), %ymm0
258 ; AVX2-NEXT: vpsubusw LCPI7_0(%rip), %ymm0, %ymm0
259 ; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
260 ; AVX2-NEXT: vzeroupper
; test9 - 256-bit, sixteen i16 lanes, subtrahend supplied at run time in %w.
; Same pattern as test3 on a vector twice as wide: %w is splatted, lanes
; smaller than the splat (unsigned) become 0, all others become lane - %w.
; Only core-avx2 expectations are visible for this function: vpbroadcastw
; into a ymm register, a register-register vpsubusw, then vzeroupper.
264 define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
266 %0 = insertelement <16 x i16> undef, i16 %w, i32 0
; zeroinitializer mask: every result lane reads lane 0 of %0
267 %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
268 %1 = getelementptr inbounds i16, i16* %head, i64 0
269 %2 = bitcast i16* %1 to <16 x i16>*
270 %3 = load <16 x i16>, <16 x i16>* %2, align 2
; mask: lane < %w (unsigned), the lanes where the subtraction would wrap
271 %4 = icmp ult <16 x i16> %3, %broadcast15
272 %5 = sub <16 x i16> %3, %broadcast15
; zero on true, difference on false
273 %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5
; The result overwrites the input vector in place.
274 store <16 x i16> %6, <16 x i16>* %2, align 2
; Expected core-avx2 output
279 ; AVX2-NEXT: vmovd %esi, %xmm0
280 ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
281 ; AVX2-NEXT: vmovdqu (%rdi), %ymm1
282 ; AVX2-NEXT: vpsubusw %ymm0, %ymm1, %ymm0
283 ; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
284 ; AVX2-NEXT: vzeroupper
; test10 - 256-bit, thirty-two i8 lanes, constant subtrahend in sign-bit form.
; Same pattern as test4 on a vector twice as wide: negative lanes get their
; top bit flipped (subtracting 128), all other lanes become 0.
; Only core-avx2 expectations are visible for this function: one ymm-wide
; vpsubusb with a constant-pool operand, then vzeroupper.
288 define void @test10(i8* nocapture %head) nounwind {
290 %0 = getelementptr inbounds i8, i8* %head, i64 0
291 %1 = bitcast i8* %0 to <32 x i8>*
292 %2 = load <32 x i8>, <32 x i8>* %1, align 1
; mask: lane < 0 (signed compare against zero)
293 %3 = icmp slt <32 x i8> %2, zeroinitializer
; -128 is 0x80, so the xor toggles only the top bit of each lane
294 %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
295 %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
; The result overwrites the input vector in place.
296 store <32 x i8> %5, <32 x i8>* %1, align 1
; Expected core-avx2 output
301 ; AVX2-NEXT: vmovdqu (%rdi), %ymm0
302 ; AVX2-NEXT: vpsubusb LCPI9_0(%rip), %ymm0, %ymm0
303 ; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
304 ; AVX2-NEXT: vzeroupper
; test11 - 256-bit, thirty-two i8 lanes, constant subtrahend in compare/add
; form. Same pattern as test5 on a vector twice as wide: lanes strictly
; greater than 126 (unsigned) have 127 subtracted, all other lanes become 0.
; Only core-avx2 expectations are visible for this function: one ymm-wide
; vpsubusb with a constant-pool operand, then vzeroupper.
308 define void @test11(i8* nocapture %head) nounwind {
310 %0 = getelementptr inbounds i8, i8* %head, i64 0
311 %1 = bitcast i8* %0 to <32 x i8>*
312 %2 = load <32 x i8>, <32 x i8>* %1, align 1
; mask: lane > 126 (unsigned), i.e. lane >= 127
313 %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
; adding -127 is the wrapping form of subtracting 127
314 %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
315 %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
; The result overwrites the input vector in place.
316 store <32 x i8> %5, <32 x i8>* %1, align 1
; Expected core-avx2 output
321 ; AVX2-NEXT: vmovdqu (%rdi), %ymm0
322 ; AVX2-NEXT: vpsubusb LCPI10_0(%rip), %ymm0, %ymm0
323 ; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
324 ; AVX2-NEXT: vzeroupper
; test12 - 256-bit, thirty-two i8 lanes, subtrahend supplied at run time in
; %w. Same pattern as test6 on a vector twice as wide: %w is splatted, lanes
; smaller than the splat (unsigned) become 0, all others become lane - %w.
; Only core-avx2 expectations are visible for this function: vpbroadcastb
; into a ymm register, a register-register vpsubusb, then vzeroupper.
328 define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
330 %0 = insertelement <32 x i8> undef, i8 %w, i32 0
; zeroinitializer mask: every result lane reads lane 0 of %0
331 %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
332 %1 = getelementptr inbounds i8, i8* %head, i64 0
333 %2 = bitcast i8* %1 to <32 x i8>*
334 %3 = load <32 x i8>, <32 x i8>* %2, align 1
; mask: lane < %w (unsigned), the lanes where the subtraction would wrap
335 %4 = icmp ult <32 x i8> %3, %broadcast15
336 %5 = sub <32 x i8> %3, %broadcast15
; zero on true, difference on false
337 %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5
; The result overwrites the input vector in place.
338 store <32 x i8> %6, <32 x i8>* %2, align 1
; Expected core-avx2 output
343 ; AVX2-NEXT: vmovd %esi, %xmm0
344 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
345 ; AVX2-NEXT: vmovdqu (%rdi), %ymm1
346 ; AVX2-NEXT: vpsubusb %ymm0, %ymm1, %ymm0
347 ; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
348 ; AVX2-NEXT: vzeroupper