; Codegen test for x86 unsigned saturating subtraction (psubusb / psubusw).
; The module is compiled three times with llc, once per CPU level (core2,
; corei7-avx, core-avx2), and each output is verified by FileCheck using its
; own check prefix (SSE2, AVX1, AVX2 respectively).
1 ; RUN: llc -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE2
2 ; RUN: llc -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1
3 ; RUN: llc -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
; The data layout and triple below pin the module to 64-bit x86 on macOS 10.8.
5 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
6 target triple = "x86_64-apple-macosx10.8.0"
; test1 -- <8 x i16>, constant subtrahend expressed through the sign bit.
; Each iteration loads 8 lanes from %head + %index, computes per lane
;   (x <s 0) ? (x ^ 0x8000) : 0
; and stores the result back to the same address. A lane is negative exactly
; when its unsigned value is >= 32768, and flipping the sign bit of such a lane
; subtracts 32768, so the pattern equals an unsigned saturating subtract of
; 32768. The SSE2, AVX1 and AVX2 check lines each expect one (v)psubusw whose
; subtrahend is the RIP-relative memory operand LCPI0_0 (presumably the
; constant-pool entry holding the splat constant).
8 define void @test1(i16* nocapture %head) nounwind {
12 vector.body: ; preds = %vector.body, %vector.ph
13 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
14 %0 = getelementptr inbounds i16* %head, i64 %index
15 %1 = bitcast i16* %0 to <8 x i16>*
; align 2 is the i16 element alignment, not the 16-byte vector alignment.
16 %2 = load <8 x i16>* %1, align 2
; Sign-bit test: true for lanes whose unsigned value is >= 32768.
17 %3 = icmp slt <8 x i16> %2, zeroinitializer
; -32768 is 0x8000; the xor flips only the sign bit of each lane.
18 %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
; Lanes that failed the test are clamped to zero (the saturation case).
19 %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
20 store <8 x i16> %5, <8 x i16>* %1, align 2
; Advance by 8 elements (one full vector); the loop exits at 16384 elements.
21 %index.next = add i64 %index, 8
22 %6 = icmp eq i64 %index.next, 16384
23 br i1 %6, label %for.end, label %vector.body
25 for.end: ; preds = %vector.body
29 ; SSE2: psubusw LCPI0_0(%rip), %xmm0
32 ; AVX1: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
35 ; AVX2: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
; test2 -- <8 x i16>, constant subtrahend expressed as compare + add.
; Per lane the loop computes
;   (x >u 32766) ? (x + (-32767)) : 0
; i.e. x - 32767 when x >= 32767 (unsigned) and 0 otherwise, which is an
; unsigned saturating subtract of 32767. The SSE2, AVX1 and AVX2 check lines
; each expect one (v)psubusw with the RIP-relative memory operand LCPI1_0.
38 define void @test2(i16* nocapture %head) nounwind {
42 vector.body: ; preds = %vector.body, %vector.ph
43 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
44 %0 = getelementptr inbounds i16* %head, i64 %index
45 %1 = bitcast i16* %0 to <8 x i16>*
46 %2 = load <8 x i16>* %1, align 2
; x >u 32766 is the same predicate as x >=u 32767, the subtrahend.
47 %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
; Adding -32767 is the subtraction x - 32767 in two's complement.
48 %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
; Lanes smaller than the subtrahend saturate to zero.
49 %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
50 store <8 x i16> %5, <8 x i16>* %1, align 2
; Advance by 8 elements (one full vector); the loop exits at 16384 elements.
51 %index.next = add i64 %index, 8
52 %6 = icmp eq i64 %index.next, 16384
53 br i1 %6, label %for.end, label %vector.body
55 for.end: ; preds = %vector.body
59 ; SSE2: psubusw LCPI1_0(%rip), %xmm0
62 ; AVX1: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
65 ; AVX2: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
; test3 -- <8 x i16>, variable subtrahend.
; The scalar argument %w is splatted to all 8 lanes, then per lane the loop
; computes
;   (x <u w) ? 0 : (x - w)
; which is an unsigned saturating subtract of w. Because the subtrahend is not
; a constant, the SSE2, AVX1 and AVX2 check lines expect the register-register
; form of (v)psubusw.
68 define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
; Splat idiom: put %w in lane 0, then shuffle with an all-zero mask so every
; result lane reads lane 0.
70 %0 = insertelement <8 x i16> undef, i16 %w, i32 0
71 %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
74 vector.body: ; preds = %vector.body, %vector.ph
75 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
76 %1 = getelementptr inbounds i16* %head, i64 %index
77 %2 = bitcast i16* %1 to <8 x i16>*
78 %3 = load <8 x i16>* %2, align 2
; True where the subtraction would wrap below zero.
79 %4 = icmp ult <8 x i16> %3, %broadcast15
80 %5 = sub <8 x i16> %3, %broadcast15
; Note the operand order: zero is chosen when the compare is TRUE here,
; the reverse of the constant-subtrahend tests.
81 %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5
82 store <8 x i16> %6, <8 x i16>* %2, align 2
; Advance by 8 elements (one full vector); the loop exits at 16384 elements.
83 %index.next = add i64 %index, 8
84 %7 = icmp eq i64 %index.next, 16384
85 br i1 %7, label %for.end, label %vector.body
87 for.end: ; preds = %vector.body
91 ; SSE2: psubusw %xmm0, %xmm1
94 ; AVX1: vpsubusw %xmm0, %xmm1, %xmm1
97 ; AVX2: vpsubusw %xmm0, %xmm1, %xmm1
; test4 -- <16 x i8>, constant subtrahend expressed through the sign bit.
; Per lane the loop computes
;   (x <s 0) ? (x ^ 0x80) : 0
; A lane is negative exactly when its unsigned value is >= 128, and flipping
; the sign bit of such a lane subtracts 128, so the pattern equals an unsigned
; saturating subtract of 128. The SSE2, AVX1 and AVX2 check lines each expect
; one (v)psubusb with the RIP-relative memory operand LCPI3_0.
100 define void @test4(i8* nocapture %head) nounwind {
102 br label %vector.body
104 vector.body: ; preds = %vector.body, %vector.ph
105 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
106 %0 = getelementptr inbounds i8* %head, i64 %index
107 %1 = bitcast i8* %0 to <16 x i8>*
108 %2 = load <16 x i8>* %1, align 1
; Sign-bit test: true for lanes whose unsigned value is >= 128.
109 %3 = icmp slt <16 x i8> %2, zeroinitializer
; -128 is 0x80; the xor flips only the sign bit of each lane.
110 %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
; Lanes that failed the test are clamped to zero (the saturation case).
111 %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
112 store <16 x i8> %5, <16 x i8>* %1, align 1
; Advance by 16 elements (one full vector); the loop exits at 16384 elements.
113 %index.next = add i64 %index, 16
114 %6 = icmp eq i64 %index.next, 16384
115 br i1 %6, label %for.end, label %vector.body
117 for.end: ; preds = %vector.body
121 ; SSE2: psubusb LCPI3_0(%rip), %xmm0
124 ; AVX1: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
127 ; AVX2: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
; test5 -- <16 x i8>, constant subtrahend expressed as compare + add.
; Per lane the loop computes
;   (x >u 126) ? (x + (-127)) : 0
; i.e. x - 127 when x >= 127 (unsigned) and 0 otherwise, which is an unsigned
; saturating subtract of 127. The SSE2, AVX1 and AVX2 check lines each expect
; one (v)psubusb with the RIP-relative memory operand LCPI4_0.
130 define void @test5(i8* nocapture %head) nounwind {
132 br label %vector.body
134 vector.body: ; preds = %vector.body, %vector.ph
135 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
136 %0 = getelementptr inbounds i8* %head, i64 %index
137 %1 = bitcast i8* %0 to <16 x i8>*
138 %2 = load <16 x i8>* %1, align 1
; x >u 126 is the same predicate as x >=u 127, the subtrahend.
139 %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
; Adding -127 is the subtraction x - 127 in two's complement.
140 %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
; Lanes smaller than the subtrahend saturate to zero.
141 %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
142 store <16 x i8> %5, <16 x i8>* %1, align 1
; Advance by 16 elements (one full vector); the loop exits at 16384 elements.
143 %index.next = add i64 %index, 16
144 %6 = icmp eq i64 %index.next, 16384
145 br i1 %6, label %for.end, label %vector.body
147 for.end: ; preds = %vector.body
151 ; SSE2: psubusb LCPI4_0(%rip), %xmm0
154 ; AVX1: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0
157 ; AVX2: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0
; test6 -- <16 x i8>, variable subtrahend.
; The scalar argument %w is splatted to all 16 lanes, then per lane the loop
; computes
;   (x <u w) ? 0 : (x - w)
; which is an unsigned saturating subtract of w. The SSE2, AVX1 and AVX2 check
; lines expect the register-register form of (v)psubusb.
160 define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
; Splat idiom: put %w in lane 0, then shuffle with an all-zero mask so every
; result lane reads lane 0.
162 %0 = insertelement <16 x i8> undef, i8 %w, i32 0
163 %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
164 br label %vector.body
166 vector.body: ; preds = %vector.body, %vector.ph
167 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
168 %1 = getelementptr inbounds i8* %head, i64 %index
169 %2 = bitcast i8* %1 to <16 x i8>*
170 %3 = load <16 x i8>* %2, align 1
; True where the subtraction would wrap below zero.
171 %4 = icmp ult <16 x i8> %3, %broadcast15
172 %5 = sub <16 x i8> %3, %broadcast15
; Zero is chosen when the compare is TRUE (underflow), else the difference.
173 %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5
174 store <16 x i8> %6, <16 x i8>* %2, align 1
; Advance by 16 elements (one full vector); the loop exits at 16384 elements.
175 %index.next = add i64 %index, 16
176 %7 = icmp eq i64 %index.next, 16384
177 br i1 %7, label %for.end, label %vector.body
179 for.end: ; preds = %vector.body
183 ; SSE2: psubusb %xmm0, %xmm1
186 ; AVX1: vpsubusb %xmm0, %xmm1, %xmm1
189 ; AVX2: vpsubusb %xmm0, %xmm1, %xmm1
; test7 -- <16 x i16> (256-bit), constant subtrahend via the sign bit.
; Per lane the loop computes
;   (x <s 0) ? (x ^ 0x8000) : 0
; which equals an unsigned saturating subtract of 32768 (a lane is negative
; exactly when its unsigned value is >= 32768, and flipping the sign bit then
; subtracts 32768). Only the AVX2 prefix has a check line in this function; it
; expects one vpsubusw on ymm registers with the memory operand LCPI6_0.
192 define void @test7(i16* nocapture %head) nounwind {
194 br label %vector.body
196 vector.body: ; preds = %vector.body, %vector.ph
197 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
198 %0 = getelementptr inbounds i16* %head, i64 %index
199 %1 = bitcast i16* %0 to <16 x i16>*
200 %2 = load <16 x i16>* %1, align 2
; Sign-bit test: true for lanes whose unsigned value is >= 32768.
201 %3 = icmp slt <16 x i16> %2, zeroinitializer
; -32768 is 0x8000; the xor flips only the sign bit of each lane.
202 %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
; Lanes that failed the test are clamped to zero (the saturation case).
203 %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
204 store <16 x i16> %5, <16 x i16>* %1, align 2
; NOTE(review): the index advances by 8 elements although each iteration
; loads and stores 16 lanes, so successive iterations overlap. This looks
; carried over from the 8-lane tests; the check line only matches the vpsubusw
; instruction, so it is probably harmless here -- confirm before changing.
205 %index.next = add i64 %index, 8
206 %6 = icmp eq i64 %index.next, 16384
207 br i1 %6, label %for.end, label %vector.body
209 for.end: ; preds = %vector.body
213 ; AVX2: vpsubusw LCPI6_0(%rip), %ymm0, %ymm0
; test8 -- <16 x i16> (256-bit), constant subtrahend as compare + add.
; Per lane the loop computes
;   (x >u 32766) ? (x + (-32767)) : 0
; i.e. an unsigned saturating subtract of 32767. Only the AVX2 prefix has a
; check line in this function; it expects one vpsubusw on ymm registers with
; the memory operand LCPI7_0.
216 define void @test8(i16* nocapture %head) nounwind {
218 br label %vector.body
220 vector.body: ; preds = %vector.body, %vector.ph
221 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
222 %0 = getelementptr inbounds i16* %head, i64 %index
223 %1 = bitcast i16* %0 to <16 x i16>*
224 %2 = load <16 x i16>* %1, align 2
; x >u 32766 is the same predicate as x >=u 32767, the subtrahend.
225 %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
; Adding -32767 is the subtraction x - 32767 in two's complement.
226 %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
; Lanes smaller than the subtrahend saturate to zero.
227 %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
228 store <16 x i16> %5, <16 x i16>* %1, align 2
; NOTE(review): the index advances by 8 elements although each iteration
; loads and stores 16 lanes, so successive iterations overlap. Probably
; harmless for a compile-only test -- confirm before changing.
229 %index.next = add i64 %index, 8
230 %6 = icmp eq i64 %index.next, 16384
231 br i1 %6, label %for.end, label %vector.body
233 for.end: ; preds = %vector.body
237 ; AVX2: vpsubusw LCPI7_0(%rip), %ymm0, %ymm0
; test9 -- <16 x i16> (256-bit), variable subtrahend.
; The scalar argument %w is splatted to all 16 lanes, then per lane the loop
; computes
;   (x <u w) ? 0 : (x - w)
; which is an unsigned saturating subtract of w. Only the AVX2 prefix has a
; check line in this function; it expects the register-register vpsubusw on
; ymm registers.
240 define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
; Splat idiom: put %w in lane 0, then shuffle with an all-zero mask so every
; result lane reads lane 0.
242 %0 = insertelement <16 x i16> undef, i16 %w, i32 0
243 %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
244 br label %vector.body
246 vector.body: ; preds = %vector.body, %vector.ph
247 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
248 %1 = getelementptr inbounds i16* %head, i64 %index
249 %2 = bitcast i16* %1 to <16 x i16>*
250 %3 = load <16 x i16>* %2, align 2
; True where the subtraction would wrap below zero.
251 %4 = icmp ult <16 x i16> %3, %broadcast15
252 %5 = sub <16 x i16> %3, %broadcast15
; Zero is chosen when the compare is TRUE (underflow), else the difference.
253 %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5
254 store <16 x i16> %6, <16 x i16>* %2, align 2
; NOTE(review): the index advances by 8 elements although each iteration
; loads and stores 16 lanes, so successive iterations overlap. Probably
; harmless for a compile-only test -- confirm before changing.
255 %index.next = add i64 %index, 8
256 %7 = icmp eq i64 %index.next, 16384
257 br i1 %7, label %for.end, label %vector.body
259 for.end: ; preds = %vector.body
264 ; AVX2: vpsubusw %ymm0, %ymm1, %ymm1
; test10 -- <32 x i8> (256-bit), constant subtrahend via the sign bit.
; Per lane the loop computes
;   (x <s 0) ? (x ^ 0x80) : 0
; which equals an unsigned saturating subtract of 128 (a lane is negative
; exactly when its unsigned value is >= 128, and flipping the sign bit then
; subtracts 128). Only the AVX2 prefix has a check line in this function; it
; expects one vpsubusb on ymm registers with the memory operand LCPI9_0.
267 define void @test10(i8* nocapture %head) nounwind {
269 br label %vector.body
271 vector.body: ; preds = %vector.body, %vector.ph
272 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
273 %0 = getelementptr inbounds i8* %head, i64 %index
274 %1 = bitcast i8* %0 to <32 x i8>*
275 %2 = load <32 x i8>* %1, align 1
; Sign-bit test: true for lanes whose unsigned value is >= 128.
276 %3 = icmp slt <32 x i8> %2, zeroinitializer
; -128 is 0x80; the xor flips only the sign bit of each lane.
277 %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
; Lanes that failed the test are clamped to zero (the saturation case).
278 %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
279 store <32 x i8> %5, <32 x i8>* %1, align 1
; NOTE(review): the index advances by 16 elements although each iteration
; loads and stores 32 lanes, so successive iterations overlap. This looks
; carried over from the 16-lane tests; the check line only matches the
; vpsubusb instruction, so it is probably harmless -- confirm before changing.
280 %index.next = add i64 %index, 16
281 %6 = icmp eq i64 %index.next, 16384
282 br i1 %6, label %for.end, label %vector.body
284 for.end: ; preds = %vector.body
289 ; AVX2: vpsubusb LCPI9_0(%rip), %ymm0, %ymm0
; test11 -- <32 x i8> (256-bit), constant subtrahend as compare + add.
; Per lane the loop computes
;   (x >u 126) ? (x + (-127)) : 0
; i.e. an unsigned saturating subtract of 127. Only the AVX2 prefix has a
; check line in this function; it expects one vpsubusb on ymm registers with
; the memory operand LCPI10_0.
292 define void @test11(i8* nocapture %head) nounwind {
294 br label %vector.body
296 vector.body: ; preds = %vector.body, %vector.ph
297 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
298 %0 = getelementptr inbounds i8* %head, i64 %index
299 %1 = bitcast i8* %0 to <32 x i8>*
300 %2 = load <32 x i8>* %1, align 1
; x >u 126 is the same predicate as x >=u 127, the subtrahend.
301 %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
; Adding -127 is the subtraction x - 127 in two's complement.
302 %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
; Lanes smaller than the subtrahend saturate to zero.
303 %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
304 store <32 x i8> %5, <32 x i8>* %1, align 1
; NOTE(review): the index advances by 16 elements although each iteration
; loads and stores 32 lanes, so successive iterations overlap. Probably
; harmless for a compile-only test -- confirm before changing.
305 %index.next = add i64 %index, 16
306 %6 = icmp eq i64 %index.next, 16384
307 br i1 %6, label %for.end, label %vector.body
309 for.end: ; preds = %vector.body
313 ; AVX2: vpsubusb LCPI10_0(%rip), %ymm0, %ymm0
; test12 -- <32 x i8> (256-bit), variable subtrahend.
; The scalar argument %w is splatted to all 32 lanes, then per lane the loop
; computes
;   (x <u w) ? 0 : (x - w)
; which is an unsigned saturating subtract of w. Only the AVX2 prefix has a
; check line in this function; it expects the register-register vpsubusb on
; ymm registers.
316 define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
; Splat idiom: put %w in lane 0, then shuffle with an all-zero mask so every
; result lane reads lane 0.
318 %0 = insertelement <32 x i8> undef, i8 %w, i32 0
319 %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
320 br label %vector.body
322 vector.body: ; preds = %vector.body, %vector.ph
323 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
324 %1 = getelementptr inbounds i8* %head, i64 %index
325 %2 = bitcast i8* %1 to <32 x i8>*
326 %3 = load <32 x i8>* %2, align 1
; True where the subtraction would wrap below zero.
327 %4 = icmp ult <32 x i8> %3, %broadcast15
328 %5 = sub <32 x i8> %3, %broadcast15
; Zero is chosen when the compare is TRUE (underflow), else the difference.
329 %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5
330 store <32 x i8> %6, <32 x i8>* %2, align 1
; NOTE(review): the index advances by 16 elements although each iteration
; loads and stores 32 lanes, so successive iterations overlap. Probably
; harmless for a compile-only test -- confirm before changing.
331 %index.next = add i64 %index, 16
332 %7 = icmp eq i64 %index.next, 16384
333 br i1 %7, label %for.end, label %vector.body
335 for.end: ; preds = %vector.body
339 ; AVX2: vpsubusb %ymm0, %ymm1, %ymm1