1 ; RUN: opt < %s -O3 -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX1
2 ; RUN: opt < %s -O3 -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX2
3 ; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
; Module-wide target settings. The CPU itself is chosen per opt invocation
; through the -mcpu option on the command lines at the top of the file.
; NOTE(review): the triple below is spelled "x86_64-pc_linux" (underscore
; between "pc" and "linux"), so it has only two dash-separated fields --
; confirm that this spelling is intentional.
7 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
8 target triple = "x86_64-pc_linux"
12 ;void foo1(int *A, int *B, int *trigger) {
14 ; for (int i=0; i<10000; i++) {
15 ; if (trigger[i] < 100) {
16 ; A[i] = B[i] + trigger[i];
22 ;AVX2: icmp slt <8 x i32> %wide.load, <i32 100, i32 100, i32 100
23 ;AVX2: call <8 x i32> @llvm.masked.load.v8i32
24 ;AVX2: add nsw <8 x i32>
25 ;AVX2: call void @llvm.masked.store.v8i32
29 ;AVX512: icmp slt <16 x i32> %wide.load, <i32 100, i32 100, i32 100
30 ;AVX512: call <16 x i32> @llvm.masked.load.v16i32
31 ;AVX512: add nsw <16 x i32>
32 ;AVX512: call void @llvm.masked.store.v16i32
35 ; Function Attrs: nounwind uwtable
; @foo1 -- unoptimized IR for the i32 loop sketched in the C comment above:
;   for i in [0, 10000): if (trigger[i] < 100) A[i] = B[i] + trigger[i]
; Arguments and the counter i are kept in stack slots and reloaded before
; each use. The check lines above expect opt -O3 to produce 8-wide
; (core-avx2) and 16-wide (knl) masked i32 loads and stores for this body.
36 define void @foo1(i32* %A, i32* %B, i32* %trigger) {
; Stack slots for the three pointer arguments and the loop counter.
38 %A.addr = alloca i32*, align 8
39 %B.addr = alloca i32*, align 8
40 %trigger.addr = alloca i32*, align 8
41 %i = alloca i32, align 4
; Spill the arguments and initialise i = 0.
42 store i32* %A, i32** %A.addr, align 8
43 store i32* %B, i32** %B.addr, align 8
44 store i32* %trigger, i32** %trigger.addr, align 8
45 store i32 0, i32* %i, align 4
; Loop header: keep iterating while i < 10000 (signed compare).
48 for.cond: ; preds = %for.inc, %entry
49 %0 = load i32, i32* %i, align 4
50 %cmp = icmp slt i32 %0, 10000
51 br i1 %cmp, label %for.body, label %for.end
; Guard: load trigger[i] and enter if.then only when it is < 100.
53 for.body: ; preds = %for.cond
54 %1 = load i32, i32* %i, align 4
55 %idxprom = sext i32 %1 to i64
56 %2 = load i32*, i32** %trigger.addr, align 8
57 %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
58 %3 = load i32, i32* %arrayidx, align 4
59 %cmp1 = icmp slt i32 %3, 100
60 br i1 %cmp1, label %if.then, label %if.end
; Conditional body: A[i] = B[i] + trigger[i]. These are the accesses that
; only execute under the guard.
62 if.then: ; preds = %for.body
; %6 = B[i]
63 %4 = load i32, i32* %i, align 4
64 %idxprom2 = sext i32 %4 to i64
65 %5 = load i32*, i32** %B.addr, align 8
66 %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
67 %6 = load i32, i32* %arrayidx3, align 4
; %9 = trigger[i] (loaded again rather than reusing %3)
68 %7 = load i32, i32* %i, align 4
69 %idxprom4 = sext i32 %7 to i64
70 %8 = load i32*, i32** %trigger.addr, align 8
71 %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
72 %9 = load i32, i32* %arrayidx5, align 4
73 %add = add nsw i32 %6, %9
; A[i] = %add
74 %10 = load i32, i32* %i, align 4
75 %idxprom6 = sext i32 %10 to i64
76 %11 = load i32*, i32** %A.addr, align 8
77 %arrayidx7 = getelementptr inbounds i32, i32* %11, i64 %idxprom6
78 store i32 %add, i32* %arrayidx7, align 4
; Join point of the guarded and unguarded paths.
81 if.end: ; preds = %if.then, %for.body
; Latch: i = i + 1, written back to its stack slot.
84 for.inc: ; preds = %if.end
85 %12 = load i32, i32* %i, align 4
86 %inc = add nsw i32 %12, 1
87 store i32 %inc, i32* %i, align 4
; Loop exit block.
90 for.end: ; preds = %for.cond
96 ;void foo2(float *A, float *B, int *trigger) {
98 ; for (int i=0; i<10000; i++) {
99 ; if (trigger[i] < 100) {
100 ; A[i] = B[i] + trigger[i];
106 ;AVX2: icmp slt <8 x i32> %wide.load, <i32 100, i32 100, i32 100
107 ;AVX2: call <8 x float> @llvm.masked.load.v8f32
108 ;AVX2: fadd <8 x float>
109 ;AVX2: call void @llvm.masked.store.v8f32
113 ;AVX512: icmp slt <16 x i32> %wide.load, <i32 100, i32 100, i32 100
114 ;AVX512: call <16 x float> @llvm.masked.load.v16f32
115 ;AVX512: fadd <16 x float>
116 ;AVX512: call void @llvm.masked.store.v16f32
119 ; Function Attrs: nounwind uwtable
; @foo2 -- float variant of the guarded loop in the C comment above:
;   for i in [0, 10000): if (trigger[i] < 100) A[i] = B[i] + (float)trigger[i]
; A and B are float arrays while trigger stays i32. The check lines above
; expect 8-wide (core-avx2) and 16-wide (knl) masked float loads and stores.
120 define void @foo2(float* %A, float* %B, i32* %trigger) {
; Stack slots for the three pointer arguments and the loop counter.
122 %A.addr = alloca float*, align 8
123 %B.addr = alloca float*, align 8
124 %trigger.addr = alloca i32*, align 8
125 %i = alloca i32, align 4
; Spill the arguments and initialise i = 0.
126 store float* %A, float** %A.addr, align 8
127 store float* %B, float** %B.addr, align 8
128 store i32* %trigger, i32** %trigger.addr, align 8
129 store i32 0, i32* %i, align 4
; Loop header: keep iterating while i < 10000 (signed compare).
132 for.cond: ; preds = %for.inc, %entry
133 %0 = load i32, i32* %i, align 4
134 %cmp = icmp slt i32 %0, 10000
135 br i1 %cmp, label %for.body, label %for.end
; Guard: load trigger[i] and enter if.then only when it is < 100.
137 for.body: ; preds = %for.cond
138 %1 = load i32, i32* %i, align 4
139 %idxprom = sext i32 %1 to i64
140 %2 = load i32*, i32** %trigger.addr, align 8
141 %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
142 %3 = load i32, i32* %arrayidx, align 4
143 %cmp1 = icmp slt i32 %3, 100
144 br i1 %cmp1, label %if.then, label %if.end
; Conditional body: A[i] = B[i] + (float)trigger[i].
146 if.then: ; preds = %for.body
; %6 = B[i]
147 %4 = load i32, i32* %i, align 4
148 %idxprom2 = sext i32 %4 to i64
149 %5 = load float*, float** %B.addr, align 8
150 %arrayidx3 = getelementptr inbounds float, float* %5, i64 %idxprom2
151 %6 = load float, float* %arrayidx3, align 4
; %9 = trigger[i], then converted from signed i32 to float
152 %7 = load i32, i32* %i, align 4
153 %idxprom4 = sext i32 %7 to i64
154 %8 = load i32*, i32** %trigger.addr, align 8
155 %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
156 %9 = load i32, i32* %arrayidx5, align 4
157 %conv = sitofp i32 %9 to float
158 %add = fadd float %6, %conv
; A[i] = %add
159 %10 = load i32, i32* %i, align 4
160 %idxprom6 = sext i32 %10 to i64
161 %11 = load float*, float** %A.addr, align 8
162 %arrayidx7 = getelementptr inbounds float, float* %11, i64 %idxprom6
163 store float %add, float* %arrayidx7, align 4
; Join point of the guarded and unguarded paths.
166 if.end: ; preds = %if.then, %for.body
; Latch: i = i + 1, written back to its stack slot.
169 for.inc: ; preds = %if.end
170 %12 = load i32, i32* %i, align 4
171 %inc = add nsw i32 %12, 1
172 store i32 %inc, i32* %i, align 4
; Loop exit block.
175 for.end: ; preds = %for.cond
181 ;void foo3(double *A, double *B, int *trigger) {
183 ; for (int i=0; i<10000; i++) {
184 ; if (trigger[i] < 100) {
185 ; A[i] = B[i] + trigger[i];
191 ;AVX2: icmp slt <4 x i32> %wide.load, <i32 100, i32 100,
192 ;AVX2: call <4 x double> @llvm.masked.load.v4f64
193 ;AVX2: sitofp <4 x i32> %wide.load to <4 x double>
194 ;AVX2: fadd <4 x double>
195 ;AVX2: call void @llvm.masked.store.v4f64
199 ;AVX512: icmp slt <8 x i32> %wide.load, <i32 100, i32 100,
200 ;AVX512: call <8 x double> @llvm.masked.load.v8f64
201 ;AVX512: sitofp <8 x i32> %wide.load to <8 x double>
202 ;AVX512: fadd <8 x double>
203 ;AVX512: call void @llvm.masked.store.v8f64
207 ; Function Attrs: nounwind uwtable
; @foo3 -- double variant of the guarded loop in the C comment above:
;   for i in [0, 10000): if (trigger[i] < 100) A[i] = B[i] + (double)trigger[i]
; The check lines above expect 4-wide (core-avx2) and 8-wide (knl) masked
; double loads and stores, plus a vector sitofp of the wide trigger load.
; NOTE(review): this is the only visible function that references attribute
; group #0, and no definition of #0 appears in the visible part of the file
; -- confirm it is defined elsewhere or that the reference is intended.
208 define void @foo3(double* %A, double* %B, i32* %trigger) #0 {
; Stack slots for the three pointer arguments and the loop counter.
210 %A.addr = alloca double*, align 8
211 %B.addr = alloca double*, align 8
212 %trigger.addr = alloca i32*, align 8
213 %i = alloca i32, align 4
; Spill the arguments and initialise i = 0.
214 store double* %A, double** %A.addr, align 8
215 store double* %B, double** %B.addr, align 8
216 store i32* %trigger, i32** %trigger.addr, align 8
217 store i32 0, i32* %i, align 4
; Loop header: keep iterating while i < 10000 (signed compare).
220 for.cond: ; preds = %for.inc, %entry
221 %0 = load i32, i32* %i, align 4
222 %cmp = icmp slt i32 %0, 10000
223 br i1 %cmp, label %for.body, label %for.end
; Guard: load trigger[i] and enter if.then only when it is < 100.
225 for.body: ; preds = %for.cond
226 %1 = load i32, i32* %i, align 4
227 %idxprom = sext i32 %1 to i64
228 %2 = load i32*, i32** %trigger.addr, align 8
229 %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
230 %3 = load i32, i32* %arrayidx, align 4
231 %cmp1 = icmp slt i32 %3, 100
232 br i1 %cmp1, label %if.then, label %if.end
; Conditional body: A[i] = B[i] + (double)trigger[i].
234 if.then: ; preds = %for.body
; %6 = B[i]
235 %4 = load i32, i32* %i, align 4
236 %idxprom2 = sext i32 %4 to i64
237 %5 = load double*, double** %B.addr, align 8
238 %arrayidx3 = getelementptr inbounds double, double* %5, i64 %idxprom2
239 %6 = load double, double* %arrayidx3, align 8
; %9 = trigger[i], then converted from signed i32 to double
240 %7 = load i32, i32* %i, align 4
241 %idxprom4 = sext i32 %7 to i64
242 %8 = load i32*, i32** %trigger.addr, align 8
243 %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
244 %9 = load i32, i32* %arrayidx5, align 4
245 %conv = sitofp i32 %9 to double
246 %add = fadd double %6, %conv
; A[i] = %add
247 %10 = load i32, i32* %i, align 4
248 %idxprom6 = sext i32 %10 to i64
249 %11 = load double*, double** %A.addr, align 8
250 %arrayidx7 = getelementptr inbounds double, double* %11, i64 %idxprom6
251 store double %add, double* %arrayidx7, align 8
; Join point of the guarded and unguarded paths.
254 if.end: ; preds = %if.then, %for.body
; Latch: i = i + 1, written back to its stack slot.
257 for.inc: ; preds = %if.end
258 %12 = load i32, i32* %i, align 4
259 %inc = add nsw i32 %12, 1
260 store i32 %inc, i32* %i, align 4
; Loop exit block.
263 for.end: ; preds = %for.cond
269 ;void foo4(double *A, double *B, int *trigger) {
271 ; for (int i=0; i<10000; i++) {
272 ; if (trigger[i] < 100) {
273 ; A[i] = B[i*2] + trigger[i]; << non-consecutive access
279 ;AVX2-NOT: llvm.masked
283 ;AVX512-NOT: llvm.masked
286 ; Function Attrs: nounwind uwtable
; @foo4 -- negative test. Same guarded double loop as the C comment above
; describes, except that B is read at index i*2 instead of i:
;   for i in [0, 10000): if (trigger[i] < 100) A[i] = B[i*2] + (double)trigger[i]
; The check lines above require that no llvm.masked intrinsic shows up in
; the optimized output for this function.
287 define void @foo4(double* %A, double* %B, i32* %trigger) {
; Stack slots for the three pointer arguments and the loop counter.
289 %A.addr = alloca double*, align 8
290 %B.addr = alloca double*, align 8
291 %trigger.addr = alloca i32*, align 8
292 %i = alloca i32, align 4
; Spill the arguments and initialise i = 0.
293 store double* %A, double** %A.addr, align 8
294 store double* %B, double** %B.addr, align 8
295 store i32* %trigger, i32** %trigger.addr, align 8
296 store i32 0, i32* %i, align 4
; Loop header: keep iterating while i < 10000 (signed compare).
299 for.cond: ; preds = %for.inc, %entry
300 %0 = load i32, i32* %i, align 4
301 %cmp = icmp slt i32 %0, 10000
302 br i1 %cmp, label %for.body, label %for.end
; Guard: load trigger[i] and enter if.then only when it is < 100.
304 for.body: ; preds = %for.cond
305 %1 = load i32, i32* %i, align 4
306 %idxprom = sext i32 %1 to i64
307 %2 = load i32*, i32** %trigger.addr, align 8
308 %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
309 %3 = load i32, i32* %arrayidx, align 4
310 %cmp1 = icmp slt i32 %3, 100
311 br i1 %cmp1, label %if.then, label %if.end
; Conditional body: A[i] = B[i*2] + (double)trigger[i].
313 if.then: ; preds = %for.body
; %6 = B[i*2] -- the multiply by 2 is what makes this load strided.
314 %4 = load i32, i32* %i, align 4
315 %mul = mul nsw i32 %4, 2
316 %idxprom2 = sext i32 %mul to i64
317 %5 = load double*, double** %B.addr, align 8
318 %arrayidx3 = getelementptr inbounds double, double* %5, i64 %idxprom2
319 %6 = load double, double* %arrayidx3, align 8
; %9 = trigger[i], then converted from signed i32 to double
320 %7 = load i32, i32* %i, align 4
321 %idxprom4 = sext i32 %7 to i64
322 %8 = load i32*, i32** %trigger.addr, align 8
323 %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
324 %9 = load i32, i32* %arrayidx5, align 4
325 %conv = sitofp i32 %9 to double
326 %add = fadd double %6, %conv
; A[i] = %add (this store still uses the unit-stride index i)
327 %10 = load i32, i32* %i, align 4
328 %idxprom6 = sext i32 %10 to i64
329 %11 = load double*, double** %A.addr, align 8
330 %arrayidx7 = getelementptr inbounds double, double* %11, i64 %idxprom6
331 store double %add, double* %arrayidx7, align 8
; Join point of the guarded and unguarded paths.
334 if.end: ; preds = %if.then, %for.body
; Latch: i = i + 1, written back to its stack slot.
337 for.inc: ; preds = %if.end
338 %12 = load i32, i32* %i, align 4
339 %inc = add nsw i32 %12, 1
340 store i32 %inc, i32* %i, align 4
; Loop exit block.
343 for.end: ; preds = %for.cond
; Globals whose addresses are compared inside the constant expression that
; @foo5 stores: element 1 of @a (a one-element array of i32*) against @c.
347 @a = common global [1 x i32*] zeroinitializer, align 8
348 @c = common global i32* null, align 8
350 ; The loop here should not be vectorized due to trapping
351 ; constant expression
353 ;AVX2-NOT: llvm.masked
354 ;AVX2: store i32 sdiv
358 ;AVX512-NOT: llvm.masked
359 ;AVX512: store i32 sdiv
362 ; Function Attrs: nounwind uwtable
; @foo5 -- negative test. The loop has the same shape as @foo1 (guard on
; trigger[i] < 100, i from 0 to 10000), but the value written to A[i] is a
; constant expression containing an sdiv whose divisor is a zero-extended
; address comparison. The comment above states the loop must stay scalar
; because that constant expression can trap; the check lines require that
; no llvm.masked intrinsic appears and that the "store i32 sdiv" survives.
363 define void @foo5(i32* %A, i32* %B, i32* %trigger) {
; Stack slots for the three pointer arguments and the loop counter.
365 %A.addr = alloca i32*, align 8
366 %B.addr = alloca i32*, align 8
367 %trigger.addr = alloca i32*, align 8
368 %i = alloca i32, align 4
; Spill the arguments and initialise i = 0.
369 store i32* %A, i32** %A.addr, align 8
370 store i32* %B, i32** %B.addr, align 8
371 store i32* %trigger, i32** %trigger.addr, align 8
372 store i32 0, i32* %i, align 4
; Loop header: keep iterating while i < 10000 (signed compare).
375 for.cond: ; preds = %for.inc, %entry
376 %0 = load i32, i32* %i, align 4
377 %cmp = icmp slt i32 %0, 10000
378 br i1 %cmp, label %for.body, label %for.end
; Guard: load trigger[i] and enter if.then only when it is < 100.
380 for.body: ; preds = %for.cond
381 %1 = load i32, i32* %i, align 4
382 %idxprom = sext i32 %1 to i64
383 %2 = load i32*, i32** %trigger.addr, align 8
384 %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
385 %3 = load i32, i32* %arrayidx, align 4
386 %cmp1 = icmp slt i32 %3, 100
387 br i1 %cmp1, label %if.then, label %if.end
; Conditional body.
389 if.then: ; preds = %for.body
; %6 = B[i]
390 %4 = load i32, i32* %i, align 4
391 %idxprom2 = sext i32 %4 to i64
392 %5 = load i32*, i32** %B.addr, align 8
393 %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
394 %6 = load i32, i32* %arrayidx3, align 4
; %9 = trigger[i]
395 %7 = load i32, i32* %i, align 4
396 %idxprom4 = sext i32 %7 to i64
397 %8 = load i32*, i32** %trigger.addr, align 8
398 %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
399 %9 = load i32, i32* %arrayidx5, align 4
; %add is computed here but is not used by any instruction visible in this
; function; the store below writes the constant expression instead.
400 %add = add nsw i32 %6, %9
; Address of A[i].
401 %10 = load i32, i32* %i, align 4
402 %idxprom6 = sext i32 %10 to i64
403 %11 = load i32*, i32** %A.addr, align 8
404 %arrayidx7 = getelementptr inbounds i32, i32* %11, i64 %idxprom6
; A[i] = 1 sdiv zext(&@a[0][1] == &@c): the divisor is 0 or 1 depending on
; whether the two global addresses compare equal.
405 store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*]* @a, i64 0, i64 1), i32** @c) to i32)), i32* %arrayidx7, align 4
; Join point of the guarded and unguarded paths.
408 if.end: ; preds = %if.then, %for.body
; Latch: i = i + 1, written back to its stack slot.
411 for.inc: ; preds = %if.end
412 %12 = load i32, i32* %i, align 4
413 %inc = add nsw i32 %12, 1
414 store i32 %inc, i32* %i, align 4
; Loop exit block.
417 for.end: ; preds = %for.cond
422 ;void foo6(double *in, double *out, unsigned size, int *trigger) {
424 ; for (int i=SIZE-1; i>=0; i--) {
425 ; if (trigger[i] > 0) {
426 ; out[i] = in[i] + (double) 0.5;
431 ;AVX2: icmp sgt <4 x i32> %reverse, zeroinitializer
432 ;AVX2: shufflevector <4 x i1>{{.*}}<4 x i32> <i32 3, i32 2, i32 1, i32 0>
433 ;AVX2: call <4 x double> @llvm.masked.load.v4f64
434 ;AVX2: fadd <4 x double>
435 ;AVX2: call void @llvm.masked.store.v4f64
439 ;AVX512: icmp sgt <8 x i32> %reverse, zeroinitializer
440 ;AVX512: shufflevector <8 x i1>{{.*}}<8 x i32> <i32 7, i32 6, i32 5, i32 4
441 ;AVX512: call <8 x double> @llvm.masked.load.v8f64
442 ;AVX512: fadd <8 x double>
443 ;AVX512: call void @llvm.masked.store.v8f64
; @foo6 -- reverse-iteration variant of the guarded loop, following the C
; comment above:
;   for i from 4095 down to 0: if (trigger[i] > 0) out[i] = in[i] + 0.5
; The check lines above expect the compare to run on a reversed trigger
; vector, the resulting i1 mask to be reversed with a shufflevector, and
; 4-wide (core-avx2) / 8-wide (knl) masked double loads and stores.
; NOTE(review): the C sketch starts at SIZE-1 while the IR hard-codes 4095,
; so SIZE is presumably 4096 -- confirm. %size is spilled to %size.addr but
; is not read by any instruction visible in this function.
447 define void @foo6(double* %in, double* %out, i32 %size, i32* %trigger) {
; Stack slots for the four arguments and the loop counter.
449 %in.addr = alloca double*, align 8
450 %out.addr = alloca double*, align 8
451 %size.addr = alloca i32, align 4
452 %trigger.addr = alloca i32*, align 8
453 %i = alloca i32, align 4
; Spill the arguments and initialise i = 4095 (the loop counts downward).
454 store double* %in, double** %in.addr, align 8
455 store double* %out, double** %out.addr, align 8
456 store i32 %size, i32* %size.addr, align 4
457 store i32* %trigger, i32** %trigger.addr, align 8
458 store i32 4095, i32* %i, align 4
; Loop header: keep iterating while i >= 0 (signed compare).
461 for.cond: ; preds = %for.inc, %entry
462 %0 = load i32, i32* %i, align 4
463 %cmp = icmp sge i32 %0, 0
464 br i1 %cmp, label %for.body, label %for.end
; Guard: load trigger[i] and enter if.then only when it is > 0.
466 for.body: ; preds = %for.cond
467 %1 = load i32, i32* %i, align 4
468 %idxprom = sext i32 %1 to i64
469 %2 = load i32*, i32** %trigger.addr, align 8
470 %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
471 %3 = load i32, i32* %arrayidx, align 4
472 %cmp1 = icmp sgt i32 %3, 0
473 br i1 %cmp1, label %if.then, label %if.end
; Conditional body: out[i] = in[i] + 0.5.
475 if.then: ; preds = %for.body
; %6 = in[i]
476 %4 = load i32, i32* %i, align 4
477 %idxprom2 = sext i32 %4 to i64
478 %5 = load double*, double** %in.addr, align 8
479 %arrayidx3 = getelementptr inbounds double, double* %5, i64 %idxprom2
480 %6 = load double, double* %arrayidx3, align 8
481 %add = fadd double %6, 5.000000e-01
; out[i] = %add
482 %7 = load i32, i32* %i, align 4
483 %idxprom4 = sext i32 %7 to i64
484 %8 = load double*, double** %out.addr, align 8
485 %arrayidx5 = getelementptr inbounds double, double* %8, i64 %idxprom4
486 store double %add, double* %arrayidx5, align 8
; Join point of the guarded and unguarded paths.
489 if.end: ; preds = %if.then, %for.body
; Latch: i = i - 1 (expressed as add nsw -1), written back to its slot.
492 for.inc: ; preds = %if.end
493 %9 = load i32, i32* %i, align 4
494 %dec = add nsw i32 %9, -1
495 store i32 %dec, i32* %i, align 4
; Loop exit block.
498 for.end: ; preds = %for.cond