test/Transforms/LoopVectorize/reduction.ll

   1 ; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
   2
   3 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
   4 target triple = "x86_64-apple-macosx10.8.0"
   5
   6 ;CHECK-LABEL: @reduction_sum(
   7 ;CHECK: phi <4 x i32>
   8 ;CHECK: load <4 x i32>
   9 ;CHECK: add <4 x i32>
  10 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  11 ;CHECK: add <4 x i32>
  12 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  13 ;CHECK: add <4 x i32>
  14 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
  15 ;CHECK: ret i32
  16 define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { ; integer add reduction: sum of i + A[i] + B[i]
  17   %1 = icmp sgt i32 %n, 0 ; the loop is skipped when n <= 0
  18   br i1 %1, label %.lr.ph, label %._crit_edge
  19
  20 .lr.ph:                                           ; preds = %0, %.lr.ph
  21   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] ; 64-bit loop counter i, starts at 0
  22   %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] ; running sum, starts at 0
  23   %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
  24   %3 = load i32* %2, align 4 ; A[i]
  25   %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
  26   %5 = load i32* %4, align 4 ; B[i]
  27   %6 = trunc i64 %indvars.iv to i32 ; i narrowed to i32 so it can be added to the sum
  28   %7 = add i32 %sum.02, %6 ; sum + i
  29   %8 = add i32 %7, %3 ; ... + A[i]
  30   %9 = add i32 %8, %5 ; ... + B[i]; this value is carried to the next iteration
  31   %indvars.iv.next = add i64 %indvars.iv, 1
  32   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  33   %exitcond = icmp eq i32 %lftr.wideiv, %n ; leave the loop once i + 1 == n
  34   br i1 %exitcond, label %._crit_edge, label %.lr.ph
  35
  36 ._crit_edge:                                      ; preds = %.lr.ph, %0
  37   %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] ; 0 when the loop was skipped, otherwise the final sum
  38   ret i32 %sum.0.lcssa
  39 }
  40
  41 ;CHECK-LABEL: @reduction_prod(
  42 ;CHECK: phi <4 x i32>
  43 ;CHECK: load <4 x i32>
  44 ;CHECK: mul <4 x i32>
  45 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  46 ;CHECK: mul <4 x i32>
  47 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  48 ;CHECK: mul <4 x i32>
  49 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
  50 ;CHECK: ret i32
  51 define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { ; integer mul reduction: product of i * A[i] * B[i]
  52   %1 = icmp sgt i32 %n, 0 ; the loop is skipped when n <= 0
  53   br i1 %1, label %.lr.ph, label %._crit_edge
  54
  55 .lr.ph:                                           ; preds = %0, %.lr.ph
  56   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] ; 64-bit loop counter i, starts at 0
  57   %prod.02 = phi i32 [ %9, %.lr.ph ], [ 1, %0 ] ; running product, starts at 1
  58   %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
  59   %3 = load i32* %2, align 4 ; A[i]
  60   %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
  61   %5 = load i32* %4, align 4 ; B[i]
  62   %6 = trunc i64 %indvars.iv to i32 ; i narrowed to i32 so it can be multiplied in
  63   %7 = mul i32 %prod.02, %6 ; prod * i
  64   %8 = mul i32 %7, %3 ; ... * A[i]
  65   %9 = mul i32 %8, %5 ; ... * B[i]; this value is carried to the next iteration
  66   %indvars.iv.next = add i64 %indvars.iv, 1
  67   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  68   %exitcond = icmp eq i32 %lftr.wideiv, %n ; leave the loop once i + 1 == n
  69   br i1 %exitcond, label %._crit_edge, label %.lr.ph
  70
  71 ._crit_edge:                                      ; preds = %.lr.ph, %0
  72   %prod.0.lcssa = phi i32 [ 1, %0 ], [ %9, %.lr.ph ] ; 1 when the loop was skipped, otherwise the final product
  73   ret i32 %prod.0.lcssa
  74 }
  75
  76 ;CHECK-LABEL: @reduction_mix(
  77 ;CHECK: phi <4 x i32>
  78 ;CHECK: load <4 x i32>
  79 ;CHECK: mul nsw <4 x i32>
  80 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  81 ;CHECK: add <4 x i32>
  82 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  83 ;CHECK: add <4 x i32>
  84 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
  85 ;CHECK: ret i32
  86 define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { ; add reduction whose per-iteration term contains a multiply: sum of i + A[i] * B[i]
  87   %1 = icmp sgt i32 %n, 0 ; the loop is skipped when n <= 0
  88   br i1 %1, label %.lr.ph, label %._crit_edge
  89
  90 .lr.ph:                                           ; preds = %0, %.lr.ph
  91   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] ; 64-bit loop counter i, starts at 0
  92   %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] ; running sum, starts at 0
  93   %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
  94   %3 = load i32* %2, align 4 ; A[i]
  95   %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
  96   %5 = load i32* %4, align 4 ; B[i]
  97   %6 = mul nsw i32 %5, %3 ; B[i] * A[i]; not part of the reduction chain itself
  98   %7 = trunc i64 %indvars.iv to i32 ; i narrowed to i32
  99   %8 = add i32 %sum.02, %7 ; sum + i
 100   %9 = add i32 %8, %6 ; ... + A[i] * B[i]; this value is carried to the next iteration
 101   %indvars.iv.next = add i64 %indvars.iv, 1
 102   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 103   %exitcond = icmp eq i32 %lftr.wideiv, %n ; leave the loop once i + 1 == n
 104   br i1 %exitcond, label %._crit_edge, label %.lr.ph
 105
 106 ._crit_edge:                                      ; preds = %.lr.ph, %0
 107   %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] ; 0 when the loop was skipped, otherwise the final sum
 108   ret i32 %sum.0.lcssa
 109 }
 110
 111 ;CHECK-LABEL: @reduction_mul(
 112 ;CHECK: mul <4 x i32>
 113 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 114 ;CHECK: mul <4 x i32>
 115 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 116 ;CHECK: mul <4 x i32>
 117 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
 118 ;CHECK: ret i32
 119 define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { ; mul reduction whose per-iteration factor is a sum: acc *= A[i] + i + B[i]
 120   %1 = icmp sgt i32 %n, 0 ; the loop is skipped when n <= 0
 121   br i1 %1, label %.lr.ph, label %._crit_edge
 122
 123 .lr.ph:                                           ; preds = %0, %.lr.ph
 124   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] ; 64-bit loop counter i, starts at 0
 125   %sum.02 = phi i32 [ %9, %.lr.ph ], [ 19, %0 ] ; running product (despite the name), starts at 19
 126   %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
 127   %3 = load i32* %2, align 4 ; A[i]
 128   %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
 129   %5 = load i32* %4, align 4 ; B[i]
 130   %6 = trunc i64 %indvars.iv to i32 ; i narrowed to i32
 131   %7 = add i32 %3, %6 ; A[i] + i
 132   %8 = add i32 %7, %5 ; ... + B[i]
 133   %9 = mul i32 %8, %sum.02 ; accumulator is the second operand of the multiply
 134   %indvars.iv.next = add i64 %indvars.iv, 1
 135   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 136   %exitcond = icmp eq i32 %lftr.wideiv, %n ; leave the loop once i + 1 == n
 137   br i1 %exitcond, label %._crit_edge, label %.lr.ph
 138
 139 ._crit_edge:                                      ; preds = %.lr.ph, %0
 140   %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] ; NOTE(review): yields 0 when the loop is skipped although the accumulator starts at 19 - confirm this is intentional
 141   ret i32 %sum.0.lcssa
 142 }
 143
 144 ;CHECK-LABEL: @start_at_non_zero(
 145 ;CHECK: phi <4 x i32>
 146 ;CHECK: <i32 120, i32 0, i32 0, i32 0>
 147 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 148 ;CHECK: add <4 x i32>
 149 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 150 ;CHECK: add <4 x i32>
 151 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
 152 ;CHECK: ret i32
 153 define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp { ; add reduction with a non-zero start value: 120 + sum of in[i] * coeff[i]; %out is not referenced in the body
 154 entry:
 155   %cmp7 = icmp sgt i32 %n, 0 ; the loop is skipped when n <= 0
 156   br i1 %cmp7, label %for.body, label %for.end
 157
 158 for.body:                                         ; preds = %entry, %for.body
 159   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] ; 64-bit loop counter i, starts at 0
 160   %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ] ; running sum, starts at 120
 161   %arrayidx = getelementptr inbounds i32* %in, i64 %indvars.iv
 162   %0 = load i32* %arrayidx, align 4 ; in[i]
 163   %arrayidx2 = getelementptr inbounds i32* %coeff, i64 %indvars.iv
 164   %1 = load i32* %arrayidx2, align 4 ; coeff[i]
 165   %mul = mul nsw i32 %1, %0 ; coeff[i] * in[i]
 166   %add = add nsw i32 %mul, %sum.09 ; accumulator is the second operand of the add
 167   %indvars.iv.next = add i64 %indvars.iv, 1
 168   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 169   %exitcond = icmp eq i32 %lftr.wideiv, %n ; leave the loop once i + 1 == n
 170   br i1 %exitcond, label %for.end, label %for.body
 171
 172 for.end:                                          ; preds = %for.body, %entry
 173   %sum.0.lcssa = phi i32 [ 120, %entry ], [ %add, %for.body ] ; 120 when the loop was skipped, otherwise the final sum
 174   ret i32 %sum.0.lcssa
 175 }
 176
 177 ;CHECK-LABEL: @reduction_and(
 178 ;CHECK: and <4 x i32>
 179 ;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1>
 180 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 181 ;CHECK: and <4 x i32>
 182 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 183 ;CHECK: and <4 x i32>
 184 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
 185 ;CHECK: ret i32
 186 define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { ; bitwise-and reduction over A[i] + B[i]
 187 entry:
 188   %cmp7 = icmp sgt i32 %n, 0 ; the loop is skipped when n <= 0
 189   br i1 %cmp7, label %for.body, label %for.end
 190
 191 for.body:                                         ; preds = %entry, %for.body
 192   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] ; 64-bit loop counter i, starts at 0
 193   %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ] ; accumulator, starts at -1 (all bits set)
 194   %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
 195   %0 = load i32* %arrayidx, align 4 ; A[i]
 196   %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv
 197   %1 = load i32* %arrayidx2, align 4 ; B[i]
 198   %add = add nsw i32 %1, %0 ; B[i] + A[i]
 199   %and = and i32 %add, %result.08 ; fold the term into the accumulator
 200   %indvars.iv.next = add i64 %indvars.iv, 1
 201   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 202   %exitcond = icmp eq i32 %lftr.wideiv, %n ; leave the loop once i + 1 == n
 203   br i1 %exitcond, label %for.end, label %for.body
 204
 205 for.end:                                          ; preds = %for.body, %entry
 206   %result.0.lcssa = phi i32 [ -1, %entry ], [ %and, %for.body ] ; -1 when the loop was skipped, otherwise the final value
 207   ret i32 %result.0.lcssa
 208 }
 209
 210 ;CHECK-LABEL: @reduction_or(
 211 ;CHECK: or <4 x i32>
 212 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 213 ;CHECK: or <4 x i32>
 214 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 215 ;CHECK: or <4 x i32>
 216 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
 217 ;CHECK: ret i32
 218 define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { ; bitwise-or reduction over A[i] + B[i]
 219 entry:
 220   %cmp7 = icmp sgt i32 %n, 0 ; the loop is skipped when n <= 0
 221   br i1 %cmp7, label %for.body, label %for.end
 222
 223 for.body:                                         ; preds = %entry, %for.body
 224   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] ; 64-bit loop counter i, starts at 0
 225   %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ] ; accumulator, starts at 0
 226   %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
 227   %0 = load i32* %arrayidx, align 4 ; A[i]
 228   %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv
 229   %1 = load i32* %arrayidx2, align 4 ; B[i]
 230   %add = add nsw i32 %1, %0 ; B[i] + A[i]
 231   %or = or i32 %add, %result.08 ; fold the term into the accumulator
 232   %indvars.iv.next = add i64 %indvars.iv, 1
 233   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 234   %exitcond = icmp eq i32 %lftr.wideiv, %n ; leave the loop once i + 1 == n
 235   br i1 %exitcond, label %for.end, label %for.body
 236
 237 for.end:                                          ; preds = %for.body, %entry
 238   %result.0.lcssa = phi i32 [ 0, %entry ], [ %or, %for.body ] ; 0 when the loop was skipped, otherwise the final value
 239   ret i32 %result.0.lcssa
 240 }
 241
 242 ;CHECK-LABEL: @reduction_xor(
 243 ;CHECK: xor <4 x i32>
 244 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 245 ;CHECK: xor <4 x i32>
 246 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 247 ;CHECK: xor <4 x i32>
 248 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
 249 ;CHECK: ret i32
 250 define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { ; bitwise-xor reduction over A[i] + B[i]
 251 entry:
 252   %cmp7 = icmp sgt i32 %n, 0 ; the loop is skipped when n <= 0
 253   br i1 %cmp7, label %for.body, label %for.end
 254
 255 for.body:                                         ; preds = %entry, %for.body
 256   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] ; 64-bit loop counter i, starts at 0
 257   %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ] ; accumulator, starts at 0
 258   %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
 259   %0 = load i32* %arrayidx, align 4 ; A[i]
 260   %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv
 261   %1 = load i32* %arrayidx2, align 4 ; B[i]
 262   %add = add nsw i32 %1, %0 ; B[i] + A[i]
 263   %xor = xor i32 %add, %result.08 ; fold the term into the accumulator
 264   %indvars.iv.next = add i64 %indvars.iv, 1
 265   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 266   %exitcond = icmp eq i32 %lftr.wideiv, %n ; leave the loop once i + 1 == n
 267   br i1 %exitcond, label %for.end, label %for.body
 268
 269 for.end:                                          ; preds = %for.body, %entry
 270   %result.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ] ; 0 when the loop was skipped, otherwise the final value
 271   ret i32 %result.0.lcssa
 272 }
 273
 274 ; In this code the reduction variable is the subtrahend (the RHS of the sub), so this is not a vectorizable reduction.
 275 ;CHECK-LABEL: @reduction_sub_rhs(
 276 ;CHECK-NOT: phi <4 x i32>
 277 ;CHECK-NOT: sub nsw <4 x i32>
 278 ;CHECK: ret i32
 279 define i32 @reduction_sub_rhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly { ; computes x = A[i] - x each iteration; the accumulator is the value being subtracted
 280 entry:
 281   %cmp4 = icmp sgt i32 %n, 0 ; the loop is skipped when n <= 0
 282   br i1 %cmp4, label %for.body, label %for.end
 283
 284 for.body:                                         ; preds = %entry, %for.body
 285   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] ; 64-bit loop counter i, starts at 0
 286   %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ] ; loop-carried value x, starts at 0
 287   %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
 288   %0 = load i32* %arrayidx, align 4 ; A[i]
 289   %sub = sub nsw i32 %0, %x.05 ; A[i] - x: the loop-carried value is the right-hand operand
 290   %indvars.iv.next = add i64 %indvars.iv, 1
 291   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 292   %exitcond = icmp eq i32 %lftr.wideiv, %n ; leave the loop once i + 1 == n
 293   br i1 %exitcond, label %for.end, label %for.body
 294
 295 for.end:                                          ; preds = %for.body, %entry
 296   %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ] ; 0 when the loop was skipped, otherwise the final x
 297   ret i32 %x.0.lcssa
 298 }
 299
 300
 301 ; In this test the reduction variable is on the LHS and we can vectorize it.
 302 ;CHECK-LABEL: @reduction_sub_lhs(
 303 ;CHECK: phi <4 x i32>
 304 ;CHECK: sub nsw <4 x i32>
 305 ;CHECK: ret i32
 306 define i32 @reduction_sub_lhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly { ; computes x = x - A[i] each iteration; the accumulator is the value being subtracted from
 307 entry:
 308   %cmp4 = icmp sgt i32 %n, 0 ; the loop is skipped when n <= 0
 309   br i1 %cmp4, label %for.body, label %for.end
 310
 311 for.body:                                         ; preds = %entry, %for.body
 312   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] ; 64-bit loop counter i, starts at 0
 313   %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ] ; loop-carried value x, starts at 0
 314   %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
 315   %0 = load i32* %arrayidx, align 4 ; A[i]
 316   %sub = sub nsw i32 %x.05, %0 ; x - A[i]: the loop-carried value is the left-hand operand
 317   %indvars.iv.next = add i64 %indvars.iv, 1
 318   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 319   %exitcond = icmp eq i32 %lftr.wideiv, %n ; leave the loop once i + 1 == n
 320   br i1 %exitcond, label %for.end, label %for.body
 321
 322 for.end:                                          ; preds = %for.body, %entry
 323   %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ] ; 0 when the loop was skipped, otherwise the final x
 324   ret i32 %x.0.lcssa
 325 }
 326
 327 ; We can vectorize conditional reductions with multi-input phis.
 328 ; CHECK-LABEL: @reduction_conditional(
 329 ; CHECK: fadd <4 x float>
 330
 331 define float @reduction_conditional(float* %A, float* %B, float* %C, float %S) { ; conditionally adds A[i] or B[i] to a float sum that starts at %S; %C is not referenced in the body
 332 entry:
 333   br label %for.body
 334
 335 for.body:
 336   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] ; 64-bit loop counter i, starts at 0
 337   %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ] ; running sum, starts at %S
 338   %arrayidx = getelementptr inbounds float* %A, i64 %indvars.iv
 339   %0 = load float* %arrayidx, align 4 ; A[i]
 340   %arrayidx2 = getelementptr inbounds float* %B, i64 %indvars.iv
 341   %1 = load float* %arrayidx2, align 4 ; B[i]
 342   %cmp3 = fcmp ogt float %0, %1 ; A[i] > B[i]; otherwise the sum is left unchanged
 343   br i1 %cmp3, label %if.then, label %for.inc
 344
 345 if.then:
 346   %cmp6 = fcmp ogt float %1, 1.000000e+00 ; B[i] > 1.0
 347   br i1 %cmp6, label %if.then8, label %if.else
 348
 349 if.then8:
 350   %add = fadd fast float %sum.033, %0 ; sum + A[i]
 351   br label %for.inc
 352
 353 if.else:
 354   %cmp14 = fcmp ogt float %0, 2.000000e+00 ; A[i] > 2.0; otherwise the sum is left unchanged
 355   br i1 %cmp14, label %if.then16, label %for.inc
 356
 357 if.then16:
 358   %add19 = fadd fast float %sum.033, %1 ; sum + B[i]
 359   br label %for.inc
 360
 361 for.inc:
 362   %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ %sum.033, %if.else ], [ %sum.033, %for.body ] ; every incoming value is either the old sum or the old sum plus one term
 363   %indvars.iv.next = add i64 %indvars.iv, 1
 364   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 365   %exitcond = icmp ne i32 %lftr.wideiv, 128 ; keep looping until i + 1 == 128
 366   br i1 %exitcond, label %for.body, label %for.end
 367
 368 for.end:
 369   %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
 370   ret float %sum.1.lcssa
 371 }
 372
 373 ; We can't vectorize reductions with phi inputs from outside the reduction.
 374 ; CHECK-LABEL: @noreduction_phi(
 375 ; CHECK-NOT: fadd <4 x float>
 376 define float @noreduction_phi(float* %A, float* %B, float* %C, float %S) { ; same control flow as @reduction_conditional except for one incoming value of %sum.1; %C is not referenced in the body
 377 entry:
 378   br label %for.body
 379
 380 for.body:
 381   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] ; 64-bit loop counter i, starts at 0
 382   %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ] ; running value, starts at %S
 383   %arrayidx = getelementptr inbounds float* %A, i64 %indvars.iv
 384   %0 = load float* %arrayidx, align 4 ; A[i]
 385   %arrayidx2 = getelementptr inbounds float* %B, i64 %indvars.iv
 386   %1 = load float* %arrayidx2, align 4 ; B[i]
 387   %cmp3 = fcmp ogt float %0, %1 ; A[i] > B[i]
 388   br i1 %cmp3, label %if.then, label %for.inc
 389
 390 if.then:
 391   %cmp6 = fcmp ogt float %1, 1.000000e+00 ; B[i] > 1.0
 392   br i1 %cmp6, label %if.then8, label %if.else
 393
 394 if.then8:
 395   %add = fadd fast float %sum.033, %0 ; value + A[i]
 396   br label %for.inc
 397
 398 if.else:
 399   %cmp14 = fcmp ogt float %0, 2.000000e+00 ; A[i] > 2.0
 400   br i1 %cmp14, label %if.then16, label %for.inc
 401
 402 if.then16:
 403   %add19 = fadd fast float %sum.033, %1 ; value + B[i]
 404   br label %for.inc
 405
 406 for.inc:
 407   %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ 0.000000e+00, %if.else ], [ %sum.033, %for.body ] ; the %if.else edge feeds the constant 0.0 instead of the running value - this is the input from outside the reduction
 408   %indvars.iv.next = add i64 %indvars.iv, 1
 409   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 410   %exitcond = icmp ne i32 %lftr.wideiv, 128 ; keep looping until i + 1 == 128
 411   br i1 %exitcond, label %for.body, label %for.end
 412
 413 for.end:
 414   %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
 415   ret float %sum.1.lcssa
 416 }
 417
 418 ; We can't vectorize reductions that feed another header PHI.
 419 ; CHECK-LABEL: @noredux_header_phi(
 420 ; CHECK-NOT: fadd <4 x float>
 421
 422 define float @noredux_header_phi(float* %A, float* %B, float* %C, float %S)  { ; two loop-carried float sums where the first one's update feeds the second; %A and %C are not referenced in the body
 423 entry:
 424   br label %for.body
 425
 426 for.body:
 427   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ; 64-bit loop counter i, starts at 0
 428   %sum2.09 = phi float [ 0.000000e+00, %entry ], [ %add1, %for.body ] ; second running sum, starts at 0.0
 429   %sum.08 = phi float [ %S, %entry ], [ %add, %for.body ] ; first running sum, starts at %S
 430   %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv
 431   %0 = load float* %arrayidx, align 4 ; B[i]
 432   %add = fadd fast float %sum.08, %0 ; first sum + B[i]
 433   %add1 = fadd fast float %sum2.09, %add ; second sum + updated first sum: %add flows into the other header phi
 434   %indvars.iv.next = add i64 %indvars.iv, 1
 435   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 436   %exitcond = icmp ne i32 %lftr.wideiv, 128 ; keep looping until i + 1 == 128
 437   br i1 %exitcond, label %for.body, label %for.end
 438
 439 for.end:
 440   %add1.lcssa = phi float [ %add1, %for.body ]
 441   %add.lcssa = phi float [ %add, %for.body ]
 442   %add2 = fadd fast float %add.lcssa, %add1.lcssa ; result is the sum of both final values
 443   ret float %add2
 444 }
 445
 446
 447 ; When vectorizing a reduction whose loop header phi value is used outside the
 448 ; loop special care must be taken. Otherwise, the reduced value feeding into the
 449 ; outside user misses a few iterations (VF-1) of the loop.
 450 ; PR16522
 451
 452 ; CHECK-LABEL: @phivalueredux(
 453 ; CHECK-NOT: x i32>
 454
 455 define i32 @phivalueredux(i32 %p) { ; returns the header phi of an xor chain rather than the xor result
 456 entry:
 457   br label %for.body
 458
 459 for.body:
 460   %t.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ] ; loop counter, starts at 0
 461   %p.addr.02 = phi i32 [ %p, %entry ], [ %xor, %for.body ] ; loop-carried value, starts at %p
 462   %xor = xor i32 %p.addr.02, -1 ; bitwise complement of the loop-carried value
 463   %inc = add nsw i32 %t.03, 1
 464   %exitcond = icmp eq i32 %inc, 16 ; leave the loop once the incremented counter reaches 16
 465   br i1 %exitcond, label %for.end, label %for.body
 466
 467 for.end:
 468   ret i32 %p.addr.02 ; outside use of the phi itself, i.e. the value from before the final xor
 469 }
 470
 471 ; Don't vectorize a reduction value that is not the last in a reduction cycle. We
 472 ; would lose iterations (VF-1) on the operations after that use.
 473 ; PR17498
 474
 475 ; CHECK-LABEL: @not_last_operation(
 476 ; CHECK-NOT: x i32>
 477 define i32 @not_last_operation(i32 %p, i32 %val) { ; the value used after the loop is an intermediate add of the chain, not its last operation
 478 entry:
 479   %tobool = icmp eq i32 %p, 0
 480   br label %for.body
 481
 482 for.body:
 483   %inc613.1 = phi i32 [ 0, %entry ], [ %inc6.1, %for.body ] ; loop counter, starts at 0
 484   %inc511.1 = phi i32 [ %val, %entry ], [ %inc5.1, %for.body ] ; loop-carried value, starts at %val
 485   %0 = zext i1 %tobool to i32
 486   %inc4.1 = xor i32 %0, 1 ; 1 when %p != 0, else 0
 487   %inc511.1.inc4.1 = add nsw i32 %inc511.1, %inc4.1 ; first add of the chain; also used in %exit
 488   %inc5.1 = add nsw i32 %inc511.1.inc4.1, 1 ; last add of the chain; this is what the phi carries
 489   %inc6.1 = add nsw i32 %inc613.1, 1
 490   %exitcond.1 = icmp eq i32 %inc6.1, 22 ; leave the loop once the incremented counter reaches 22
 491   br i1 %exitcond.1, label %exit, label %for.body
 492
 493 exit:
 494   %inc.2 = add nsw i32 %inc511.1.inc4.1, 2 ; outside use of the intermediate value rather than %inc5.1
 495   ret i32 %inc.2
 496 }