lib/Target/PowerPC/README.txt

   1 //===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
   2
   3 TODO:
   4 * gpr0 allocation
   5 * implement do-loop -> bdnz transform
   6 * Implement __builtin_trap (ISD::TRAP) as 'tw 31, 0, 0' aka 'trap'.
   7 * lmw/stmw pass a la arm load store optimizer for prolog/epilog
   8
   9 ===-------------------------------------------------------------------------===
  10
  11 Support 'update' load/store instructions.  These are cracked on the G5, but are
  12 still a codesize win.
  13
  14 With preinc enabled, this:
  15
  16 long *%test4(long *%X, long *%dest) {
  17         %Y = getelementptr long* %X, int 4
  18         %A = load long* %Y
  19         store long %A, long* %dest
  20         ret long* %Y
  21 }
  22
  23 compiles to:
  24
  25 _test4:
  26         mr r2, r3
  27         lwzu r5, 32(r2)
  28         lwz r3, 36(r3)
  29         stw r5, 0(r4)
  30         stw r3, 4(r4)
  31         mr r3, r2
  32         blr
  33
  34 with -sched=list-burr, I get:
  35
  36 _test4:
  37         lwz r2, 36(r3)
  38         lwzu r5, 32(r3)
  39         stw r2, 4(r4)
  40         stw r5, 0(r4)
  41         blr
  42
  43 ===-------------------------------------------------------------------------===
  44
  45 We compile the hottest inner loop of viterbi to:
  46
  47         li r6, 0
  48         b LBB1_84       ;bb432.i
  49 LBB1_83:        ;bb420.i
  50         lbzx r8, r5, r7
  51         addi r6, r7, 1
  52         stbx r8, r4, r7
  53 LBB1_84:        ;bb432.i
  54         mr r7, r6
  55         cmplwi cr0, r7, 143
  56         bne cr0, LBB1_83        ;bb420.i
  57
  58 The CBE manages to produce:
  59
  60         li r0, 143
  61         mtctr r0
  62 loop:
  63         lbzx r2, r2, r11
  64         stbx r0, r2, r9
  65         addi r2, r2, 1
  66         bdz later
  67         b loop
  68
  69 This could be much better (bdnz instead of bdz) but it still beats us.  If we
  70 produced this with bdnz, the loop would be a single dispatch group.
  71
  72 ===-------------------------------------------------------------------------===
  73
  74 Compile:
  75
  76 void foo(int *P) {
  77  if (P)  *P = 0;
  78 }
  79
  80 into:
  81
  82 _foo:
  83         cmpwi cr0,r3,0
  84         beqlr cr0
  85         li r0,0
  86         stw r0,0(r3)
  87         blr
  88
  89 This is effectively a simple form of predication.
  90
  91 ===-------------------------------------------------------------------------===
  92
  93 Lump the constant pool for each function into ONE pic object, and reference
  94 pieces of it as offsets from the start.  For functions like this (contrived
  95 to have lots of constants obviously):
  96
  97 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
  98
  99 We generate:
 100
 101 _X:
 102         lis r2, ha16(.CPI_X_0)
 103         lfd f0, lo16(.CPI_X_0)(r2)
 104         lis r2, ha16(.CPI_X_1)
 105         lfd f2, lo16(.CPI_X_1)(r2)
 106         fmadd f0, f1, f0, f2
 107         lis r2, ha16(.CPI_X_2)
 108         lfd f1, lo16(.CPI_X_2)(r2)
 109         lis r2, ha16(.CPI_X_3)
 110         lfd f2, lo16(.CPI_X_3)(r2)
 111         fmadd f1, f0, f1, f2
 112         blr
 113
 114 It would be better to materialize .CPI_X into a register, then use immediates
 115 off of the register to avoid the lis's.  This is even more important in PIC
 116 mode.
 117
 118 Note that this (and the static variable version) is discussed here for GCC:
 119 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 120
 121 Here's another example (the sgn function):
 122 double testf(double a) {
 123        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
 124 }
 125
 126 it produces a BB like this:
 127 LBB1_1: ; cond_true
 128         lis r2, ha16(LCPI1_0)
 129         lfs f0, lo16(LCPI1_0)(r2)
 130         lis r2, ha16(LCPI1_1)
 131         lis r3, ha16(LCPI1_2)
 132         lfs f2, lo16(LCPI1_2)(r3)
 133         lfs f3, lo16(LCPI1_1)(r2)
 134         fsub f0, f0, f1
 135         fsel f1, f0, f2, f3
 136         blr
 137
 138 ===-------------------------------------------------------------------------===
 139
 140 PIC Code Gen IPO optimization:
 141
 142 Squish small scalar globals together into a single global struct, allowing the
 143 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
 144 of the GOT on targets with one).
 145
 146 Note that this is discussed here for GCC:
 147 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 148
 149 ===-------------------------------------------------------------------------===
 150
 151 Implement Newton-Rhapson method for improving estimate instructions to the
 152 correct accuracy, and implementing divide as multiply by reciprocal when it has
 153 more than one use.  Itanium will want this too.
 154
 155 ===-------------------------------------------------------------------------===
 156
 157 Compile offsets from allocas:
 158
 159 int *%test() {
 160         %X = alloca { int, int }
 161         %Y = getelementptr {int,int}* %X, int 0, uint 1
 162         ret int* %Y
 163 }
 164
 165 into a single add, not two:
 166
 167 _test:
 168         addi r2, r1, -8
 169         addi r3, r2, 4
 170         blr
 171
 172 --> important for C++.
 173
 174 ===-------------------------------------------------------------------------===
 175
 176 No loads or stores of the constants should be needed:
 177
 178 struct foo { double X, Y; };
 179 void xxx(struct foo F);
 180 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 181
 182 ===-------------------------------------------------------------------------===
 183
 184 Darwin Stub LICM optimization:
 185
 186 Loops like this:
 187
 188   for (...)  bar();
 189
 190 Have to go through an indirect stub if bar is external or linkonce.  It would
 191 be better to compile it as:
 192
 193      fp = &bar;
 194      for (...)  fp();
 195
 196 which only computes the address of bar once (instead of each time through the
 197 stub).  This is Darwin specific and would have to be done in the code generator.
 198 Probably not a win on x86.
 199
 200 ===-------------------------------------------------------------------------===
 201
 202 Simple IPO for argument passing, change:
 203   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 204
 205 the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 206 of arguments get assigned to r3 through r10. That is, if you have a function
 207 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 208 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 209 order for functions we can internalize so that the maximum number of
 210 integers/pointers get passed in regs before you see any of the fp arguments.
 211
 212 Instead of implementing this, it would actually probably be easier to just
 213 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 214 including having this work sanely.
 215
 216 ===-------------------------------------------------------------------------===
 217
 218 Fix Darwin FP-In-Integer Registers ABI
 219
 220 Darwin passes doubles in structures in integer registers, which is very very
 221 bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
 222 that percolates these things out of functions.
 223
 224 Check out how horrible this is:
 225 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 226
 227 This is an extension of "interprocedural CC unmunging" that can't be done with
 228 just fastcc.
 229
 230 ===-------------------------------------------------------------------------===
 231
 232 Compile this:
 233
 234 int foo(int a) {
 235   int b = (a < 8);
 236   if (b) {
 237     return b * 3;     // ignore the fact that this is always 3.
 238   } else {
 239     return 2;
 240   }
 241 }
 242
 243 into something not this:
 244
 245 _foo:
 246 1)      cmpwi cr7, r3, 8
 247         mfcr r2, 1
 248         rlwinm r2, r2, 29, 31, 31
 249 1)      cmpwi cr0, r3, 7
 250         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 251 LBB1_1: ; then
 252         rlwinm r2, r2, 0, 31, 31
 253         mulli r3, r2, 3
 254         blr
 255 LBB1_2: ; UnifiedReturnBlock
 256         li r3, 2
 257         blr
 258
 259 In particular, the two compares (marked 1) could be shared by reversing one.
 260 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 261 same operands (but backwards) exists.  In this case, this wouldn't save us
 262 anything though, because the compares still wouldn't be shared.
 263
 264 ===-------------------------------------------------------------------------===
 265
 266 We should custom expand setcc instead of pretending that we have it.  That
 267 would allow us to expose the access of the crbit after the mfcr, allowing
 268 that access to be trivially folded into other ops.  A simple example:
 269
 270 int foo(int a, int b) { return (a < b) << 4; }
 271
 272 compiles into:
 273
 274 _foo:
 275         cmpw cr7, r3, r4
 276         mfcr r2, 1
 277         rlwinm r2, r2, 29, 31, 31
 278         slwi r3, r2, 4
 279         blr
 280
 281 ===-------------------------------------------------------------------------===
 282
 283 Fold add and sub with constant into non-extern, non-weak addresses so this:
 284
 285 static int a;
 286 void bar(int b) { a = b; }
 287 void foo(unsigned char *c) {
 288   *c = a;
 289 }
 290
 291 So that
 292
 293 _foo:
 294         lis r2, ha16(_a)
 295         la r2, lo16(_a)(r2)
 296         lbz r2, 3(r2)
 297         stb r2, 0(r3)
 298         blr
 299
 300 Becomes
 301
 302 _foo:
 303         lis r2, ha16(_a+3)
 304         lbz r2, lo16(_a+3)(r2)
 305         stb r2, 0(r3)
 306         blr
 307
 308 ===-------------------------------------------------------------------------===
 309
 310 We generate really bad code for this:
 311
 312 int f(signed char *a, _Bool b, _Bool c) {
 313    signed char t = 0;
 314   if (b)  t = *a;
 315   if (c)  *a = t;
 316 }
 317
 318 ===-------------------------------------------------------------------------===
 319
 320 This:
 321 int test(unsigned *P) { return *P >> 24; }
 322
 323 Should compile to:
 324
 325 _test:
 326         lbz r3,0(r3)
 327         blr
 328
 329 not:
 330
 331 _test:
 332         lwz r2, 0(r3)
 333         srwi r3, r2, 24
 334         blr
 335
 336 ===-------------------------------------------------------------------------===
 337
 338 On the G5, logical CR operations are more expensive in their three
 339 address form: ops that read/write the same register are half as expensive as
 340 those that read from two registers that are different from their destination.
 341
 342 We should model this with two separate instructions.  The isel should generate
 343 the "two address" form of the instructions.  When the register allocator
 344 detects that it needs to insert a copy due to the two-addresness of the CR
 345 logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
 346 we can convert to the "three address" instruction, to save code space.
 347
 348 This only matters when we start generating cr logical ops.
 349
 350 ===-------------------------------------------------------------------------===
 351
 352 We should compile these two functions to the same thing:
 353
 354 #include <stdlib.h>
 355 void f(int a, int b, int *P) {
 356   *P = (a-b)>=0?(a-b):(b-a);
 357 }
 358 void g(int a, int b, int *P) {
 359   *P = abs(a-b);
 360 }
 361
 362 Further, they should compile to something better than:
 363
 364 _g:
 365         subf r2, r4, r3
 366         subfic r3, r2, 0
 367         cmpwi cr0, r2, -1
 368         bgt cr0, LBB2_2 ; entry
 369 LBB2_1: ; entry
 370         mr r2, r3
 371 LBB2_2: ; entry
 372         stw r2, 0(r5)
 373         blr
 374
 375 GCC produces:
 376
 377 _g:
 378         subf r4,r4,r3
 379         srawi r2,r4,31
 380         xor r0,r2,r4
 381         subf r0,r2,r0
 382         stw r0,0(r5)
 383         blr
 384
 385 ... which is much nicer.
 386
 387 This theoretically may help improve twolf slightly (used in dimbox.c:142?).
 388
 389 ===-------------------------------------------------------------------------===
 390
 391 int foo(int N, int ***W, int **TK, int X) {
 392   int t, i;
 393
 394   for (t = 0; t < N; ++t)
 395     for (i = 0; i < 4; ++i)
 396       W[t / X][i][t % X] = TK[i][t];
 397
 398   return 5;
 399 }
 400
 401 We generate relatively atrocious code for this loop compared to gcc.
 402
 403 We could also strength reduce the rem and the div:
 404 http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
 405
 406 ===-------------------------------------------------------------------------===
 407
 408 float foo(float X) { return (int)(X); }
 409
 410 Currently produces:
 411
 412 _foo:
 413         fctiwz f0, f1
 414         stfd f0, -8(r1)
 415         lwz r2, -4(r1)
 416         extsw r2, r2
 417         std r2, -16(r1)
 418         lfd f0, -16(r1)
 419         fcfid f0, f0
 420         frsp f1, f0
 421         blr
 422
 423 We could use a target dag combine to turn the lwz/extsw into an lwa when the
 424 lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
 425 win only.
 426
 427 ===-------------------------------------------------------------------------===
 428
 429 We generate ugly code for this:
 430
 431 void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
 432   unsigned code = 0;
 433   if(dx < -dw) code |= 1;
 434   if(dx > dw)  code |= 2;
 435   if(dy < -dw) code |= 4;
 436   if(dy > dw)  code |= 8;
 437   if(dz < -dw) code |= 16;
 438   if(dz > dw)  code |= 32;
 439   *ret = code;
 440 }
 441
 442 ===-------------------------------------------------------------------------===
 443
 444 Complete the signed i32 to FP conversion code using 64-bit registers
 445 transformation, good for PI.  See PPCISelLowering.cpp, this comment:
 446
 447      // FIXME: disable this lowered code.  This generates 64-bit register values,
 448      // and we don't model the fact that the top part is clobbered by calls.  We
 449      // need to flag these together so that the value isn't live across a call.
 450      //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 451
 452 Also, if the registers are spilled to the stack, we have to ensure that all
 453 64-bits of them are save/restored, otherwise we will miscompile the code.  It
 454 sounds like we need to get the 64-bit register classes going.
 455
 456 ===-------------------------------------------------------------------------===
 457
 458 %struct.B = type { i8, [3 x i8] }
 459
 460 define void @bar(%struct.B* %b) {
 461 entry:
 462         %tmp = bitcast %struct.B* %b to i32*              ; <uint*> [#uses=1]
 463         %tmp = load i32* %tmp          ; <uint> [#uses=1]
 464         %tmp3 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=1]
 465         %tmp4 = load i32* %tmp3                ; <uint> [#uses=1]
 466         %tmp8 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=2]
 467         %tmp9 = load i32* %tmp8                ; <uint> [#uses=1]
 468         %tmp4.mask17 = shl i32 %tmp4, i8 1          ; <uint> [#uses=1]
 469         %tmp1415 = and i32 %tmp4.mask17, 2147483648            ; <uint> [#uses=1]
 470         %tmp.masked = and i32 %tmp, 2147483648         ; <uint> [#uses=1]
 471         %tmp11 = or i32 %tmp1415, %tmp.masked          ; <uint> [#uses=1]
 472         %tmp12 = and i32 %tmp9, 2147483647             ; <uint> [#uses=1]
 473         %tmp13 = or i32 %tmp12, %tmp11         ; <uint> [#uses=1]
 474         store i32 %tmp13, i32* %tmp8
 475         ret void
 476 }
 477
 478 We emit:
 479
 480 _foo:
 481         lwz r2, 0(r3)
 482         slwi r4, r2, 1
 483         or r4, r4, r2
 484         rlwimi r2, r4, 0, 0, 0
 485         stw r2, 0(r3)
 486         blr
 487
 488 We could collapse a bunch of those ORs and ANDs and generate the following
 489 equivalent code:
 490
 491 _foo:
 492         lwz r2, 0(r3)
 493         rlwinm r4, r2, 1, 0, 0
 494         or r2, r2, r4
 495         stw r2, 0(r3)
 496         blr
 497
 498 ===-------------------------------------------------------------------------===
 499
 500 We compile:
 501
 502 unsigned test6(unsigned x) {
 503   return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
 504 }
 505
 506 into:
 507
 508 _test6:
 509         lis r2, 255
 510         rlwinm r3, r3, 16, 0, 31
 511         ori r2, r2, 255
 512         and r3, r3, r2
 513         blr
 514
 515 GCC gets it down to:
 516
 517 _test6:
 518         rlwinm r0,r3,16,8,15
 519         rlwinm r3,r3,16,24,31
 520         or r3,r3,r0
 521         blr
 522
 523
 524 ===-------------------------------------------------------------------------===
 525
 526 Consider a function like this:
 527
 528 float foo(float X) { return X + 1234.4123f; }
 529
 530 The FP constant ends up in the constant pool, so we need to get the LR register.
 531  This ends up producing code like this:
 532
 533 _foo:
 534 .LBB_foo_0:     ; entry
 535         mflr r11
 536 ***     stw r11, 8(r1)
 537         bl "L00000$pb"
 538 "L00000$pb":
 539         mflr r2
 540         addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
 541         lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
 542         fadds f1, f1, f0
 543 ***     lwz r11, 8(r1)
 544         mtlr r11
 545         blr
 546
 547 This is functional, but there is no reason to spill the LR register all the way
 548 to the stack (the two marked instrs): spilling it to a GPR is quite enough.
 549
 550 Implementing this will require some codegen improvements.  Nate writes:
 551
 552 "So basically what we need to support the "no stack frame save and restore" is a
 553 generalization of the LR optimization to "callee-save regs".
 554
 555 Currently, we have LR marked as a callee-save reg.  The register allocator sees
 556 that it's callee save, and spills it directly to the stack.
 557
 558 Ideally, something like this would happen:
 559
 560 LR would be in a separate register class from the GPRs. The class of LR would be
 561 marked "unspillable".  When the register allocator came across an unspillable
 562 reg, it would ask "what is the best class to copy this into that I *can* spill"
 563 If it gets a class back, which it will in this case (the gprs), it grabs a free
 564 register of that class.  If it is then later necessary to spill that reg, so be
 565 it.
 566
 567 ===-------------------------------------------------------------------------===
 568
 569 We compile this:
 570 int test(_Bool X) {
 571   return X ? 524288 : 0;
 572 }
 573
 574 to:
 575 _test:
 576         cmplwi cr0, r3, 0
 577         lis r2, 8
 578         li r3, 0
 579         beq cr0, LBB1_2 ;entry
 580 LBB1_1: ;entry
 581         mr r3, r2
 582 LBB1_2: ;entry
 583         blr
 584
 585 instead of:
 586 _test:
 587         addic r2,r3,-1
 588         subfe r0,r2,r3
 589         slwi r3,r0,19
 590         blr
 591
 592 This sort of thing occurs a lot due to globalopt.
 593
 594 ===-------------------------------------------------------------------------===
 595
 596 We currently compile 32-bit bswap:
 597
 598 declare i32 @llvm.bswap.i32(i32 %A)
 599 define i32 @test(i32 %A) {
 600         %B = call i32 @llvm.bswap.i32(i32 %A)
 601         ret i32 %B
 602 }
 603
 604 to:
 605
 606 _test:
 607         rlwinm r2, r3, 24, 16, 23
 608         slwi r4, r3, 24
 609         rlwimi r2, r3, 8, 24, 31
 610         rlwimi r4, r3, 8, 8, 15
 611         rlwimi r4, r2, 0, 16, 31
 612         mr r3, r4
 613         blr
 614
 615 it would be more efficient to produce:
 616
 617 _foo:   mr r0,r3
 618         rlwinm r3,r3,8,0xffffffff
 619         rlwimi r3,r0,24,0,7
 620         rlwimi r3,r0,24,16,23
 621         blr
 622
 623 ===-------------------------------------------------------------------------===
 624
 625 test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:
 626
 627 __ZNK4llvm5APInt17countLeadingZerosEv:
 628         ld r2, 0(r3)
 629         cntlzd r2, r2
 630         or r2, r2, r2     <<-- silly.
 631         addi r3, r2, -64
 632         blr
 633
 634 The dead or is a 'truncate' from 64- to 32-bits.
 635
 636 ===-------------------------------------------------------------------------===
 637
 638 We generate horrible ppc code for this:
 639
 640 #define N  2000000
 641 double   a[N],c[N];
 642 void simpleloop() {
 643    int j;
 644    for (j=0; j<N; j++)
 645      c[j] = a[j];
 646 }
 647
 648 LBB1_1: ;bb
 649         lfdx f0, r3, r4
 650         addi r5, r5, 1                 ;; Extra IV for the exit value compare.
 651         stfdx f0, r2, r4
 652         addi r4, r4, 8
 653
 654         xoris r6, r5, 30               ;; This is due to a large immediate.
 655         cmplwi cr0, r6, 33920
 656         bne cr0, LBB1_1
 657
 658 //===---------------------------------------------------------------------===//
 659
 660 This:
 661         #include <algorithm>
 662         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
 663         { return std::make_pair(a + b, a + b < a); }
 664         bool no_overflow(unsigned a, unsigned b)
 665         { return !full_add(a, b).second; }
 666
 667 Should compile to:
 668
 669 __Z11no_overflowjj:
 670         add r4,r3,r4
 671         subfc r3,r3,r4
 672         li r3,0
 673         adde r3,r3,r3
 674         blr
 675
 676 (or better) not:
 677
 678 __Z11no_overflowjj:
 679         add r2, r4, r3
 680         cmplw cr7, r2, r3
 681         mfcr r2
 682         rlwinm r2, r2, 29, 31, 31
 683         xori r3, r2, 1
 684         blr
 685
 686 //===---------------------------------------------------------------------===//
 687
 688 We compile some FP comparisons into an mfcr with two rlwinms and an or.  For
 689 example:
 690 #include <math.h>
 691 int test(double x, double y) { return islessequal(x, y);}
 692 int test2(double x, double y) {  return islessgreater(x, y);}
 693 int test3(double x, double y) {  return !islessequal(x, y);}
 694
 695 Compiles into (all three are similar, but the bits differ):
 696
 697 _test:
 698         fcmpu cr7, f1, f2
 699         mfcr r2
 700         rlwinm r3, r2, 29, 31, 31
 701         rlwinm r2, r2, 31, 31, 31
 702         or r3, r2, r3
 703         blr
 704
 705 GCC compiles this into:
 706
 707  _test:
 708         fcmpu cr7,f1,f2
 709         cror 30,28,30
 710         mfcr r3
 711         rlwinm r3,r3,31,1
 712         blr
 713
 714 which is more efficient and can use mfocr.  See PR642 for some more context.
 715
 716 //===---------------------------------------------------------------------===//
 717
 718 void foo(float *data, float d) {
 719    long i;
 720    for (i = 0; i < 8000; i++)
 721       data[i] = d;
 722 }
 723 void foo2(float *data, float d) {
 724    long i;
 725    data--;
 726    for (i = 0; i < 8000; i++) {
 727       data[1] = d;
 728       data++;
 729    }
 730 }
 731
 732 These compile to:
 733
 734 _foo:
 735         li r2, 0
 736 LBB1_1: ; bb
 737         addi r4, r2, 4
 738         stfsx f1, r3, r2
 739         cmplwi cr0, r4, 32000
 740         mr r2, r4
 741         bne cr0, LBB1_1 ; bb
 742         blr
 743 _foo2:
 744         li r2, 0
 745 LBB2_1: ; bb
 746         addi r4, r2, 4
 747         stfsx f1, r3, r2
 748         cmplwi cr0, r4, 32000
 749         mr r2, r4
 750         bne cr0, LBB2_1 ; bb
 751         blr
 752
 753 The 'mr' could be eliminated to folding the add into the cmp better.
 754
 755 //===---------------------------------------------------------------------===//