lib/Target/PowerPC/README.txt

   1 //===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
   2
   3 TODO:
   4 * gpr0 allocation
   5 * implement do-loop -> bdnz transform
   6 * Implement __builtin_trap (ISD::TRAP) as 'tw 31, 0, 0' aka 'trap'.
   7 * lmw/stmw pass a la arm load store optimizer for prolog/epilog
   8
   9 ===-------------------------------------------------------------------------===
  10
  11 Support 'update' load/store instructions.  These are cracked on the G5, but are
  12 still a codesize win.
  13
  14 With preinc enabled, this:
  15
  16 long *%test4(long *%X, long *%dest) {
  17         %Y = getelementptr long* %X, int 4
  18         %A = load long* %Y
  19         store long %A, long* %dest
  20         ret long* %Y
  21 }
  22
  23 compiles to:
  24
  25 _test4:
  26         mr r2, r3
  27         lwzu r5, 32(r2)
  28         lwz r3, 36(r3)
  29         stw r5, 0(r4)
  30         stw r3, 4(r4)
  31         mr r3, r2
  32         blr
  33
  34 with -sched=list-burr, I get:
  35
  36 _test4:
  37         lwz r2, 36(r3)
  38         lwzu r5, 32(r3)
  39         stw r2, 4(r4)
  40         stw r5, 0(r4)
  41         blr
  42
  43 ===-------------------------------------------------------------------------===
  44
  45 We compile the hottest inner loop of viterbi to:
  46
  47         li r6, 0
  48         b LBB1_84       ;bb432.i
  49 LBB1_83:        ;bb420.i
  50         lbzx r8, r5, r7
  51         addi r6, r7, 1
  52         stbx r8, r4, r7
  53 LBB1_84:        ;bb432.i
  54         mr r7, r6
  55         cmplwi cr0, r7, 143
  56         bne cr0, LBB1_83        ;bb420.i
  57
  58 The CBE manages to produce:
  59
  60         li r0, 143
  61         mtctr r0
  62 loop:
  63         lbzx r2, r2, r11
  64         stbx r0, r2, r9
  65         addi r2, r2, 1
  66         bdz later
  67         b loop
  68
  69 This could be much better (bdnz instead of bdz) but it still beats us.  If we
  70 produced this with bdnz, the loop would be a single dispatch group.
  71
  72 ===-------------------------------------------------------------------------===
  73
  74 Compile:
  75
  76 void foo(int *P) {
  77  if (P)  *P = 0;
  78 }
  79
  80 into:
  81
  82 _foo:
  83         cmpwi cr0,r3,0
  84         beqlr cr0
  85         li r0,0
  86         stw r0,0(r3)
  87         blr
  88
  89 This is effectively a simple form of predication.
  90
  91 ===-------------------------------------------------------------------------===
  92
  93 Lump the constant pool for each function into ONE pic object, and reference
  94 pieces of it as offsets from the start.  For functions like this (contrived
  95 to have lots of constants obviously):
  96
  97 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
  98
  99 We generate:
 100
 101 _X:
 102         lis r2, ha16(.CPI_X_0)
 103         lfd f0, lo16(.CPI_X_0)(r2)
 104         lis r2, ha16(.CPI_X_1)
 105         lfd f2, lo16(.CPI_X_1)(r2)
 106         fmadd f0, f1, f0, f2
 107         lis r2, ha16(.CPI_X_2)
 108         lfd f1, lo16(.CPI_X_2)(r2)
 109         lis r2, ha16(.CPI_X_3)
 110         lfd f2, lo16(.CPI_X_3)(r2)
 111         fmadd f1, f0, f1, f2
 112         blr
 113
 114 It would be better to materialize .CPI_X into a register, then use immediates
 115 off of the register to avoid the lis's.  This is even more important in PIC
 116 mode.
 117
 118 Note that this (and the static variable version) is discussed here for GCC:
 119 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 120
 121 Here's another example (the sgn function):
 122 double testf(double a) {
 123        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
 124 }
 125
 126 it produces a BB like this:
 127 LBB1_1: ; cond_true
 128         lis r2, ha16(LCPI1_0)
 129         lfs f0, lo16(LCPI1_0)(r2)
 130         lis r2, ha16(LCPI1_1)
 131         lis r3, ha16(LCPI1_2)
 132         lfs f2, lo16(LCPI1_2)(r3)
 133         lfs f3, lo16(LCPI1_1)(r2)
 134         fsub f0, f0, f1
 135         fsel f1, f0, f2, f3
 136         blr
 137
 138 ===-------------------------------------------------------------------------===
 139
 140 PIC Code Gen IPO optimization:
 141
 142 Squish small scalar globals together into a single global struct, allowing the
 143 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
 144 of the GOT on targets with one).
 145
 146 Note that this is discussed here for GCC:
 147 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 148
 149 ===-------------------------------------------------------------------------===
 150
 151 Implement Newton-Rhapson method for improving estimate instructions to the
 152 correct accuracy, and implementing divide as multiply by reciprocal when it has
 153 more than one use.  Itanium will want this too.
 154
 155 ===-------------------------------------------------------------------------===
 156
 157 Compile this:
 158
 159 int %f1(int %a, int %b) {
 160         %tmp.1 = and int %a, 15         ; <int> [#uses=1]
 161         %tmp.3 = and int %b, 240                ; <int> [#uses=1]
 162         %tmp.4 = or int %tmp.3, %tmp.1          ; <int> [#uses=1]
 163         ret int %tmp.4
 164 }
 165
 166 without a copy.  We make this currently:
 167
 168 _f1:
 169         rlwinm r2, r4, 0, 24, 27
 170         rlwimi r2, r3, 0, 28, 31
 171         or r3, r2, r2
 172         blr
 173
 174 The two-addr pass or RA needs to learn when it is profitable to commute an
 175 instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
 176 currently only commutes to avoid inserting a copy BEFORE the two addr instr.
 177
 178 ===-------------------------------------------------------------------------===
 179
 180 Compile offsets from allocas:
 181
 182 int *%test() {
 183         %X = alloca { int, int }
 184         %Y = getelementptr {int,int}* %X, int 0, uint 1
 185         ret int* %Y
 186 }
 187
 188 into a single add, not two:
 189
 190 _test:
 191         addi r2, r1, -8
 192         addi r3, r2, 4
 193         blr
 194
 195 --> important for C++.
 196
 197 ===-------------------------------------------------------------------------===
 198
 199 No loads or stores of the constants should be needed:
 200
 201 struct foo { double X, Y; };
 202 void xxx(struct foo F);
 203 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 204
 205 ===-------------------------------------------------------------------------===
 206
 207 Darwin Stub LICM optimization:
 208
 209 Loops like this:
 210
 211   for (...)  bar();
 212
 213 Have to go through an indirect stub if bar is external or linkonce.  It would
 214 be better to compile it as:
 215
 216      fp = &bar;
 217      for (...)  fp();
 218
 219 which only computes the address of bar once (instead of each time through the
 220 stub).  This is Darwin specific and would have to be done in the code generator.
 221 Probably not a win on x86.
 222
 223 ===-------------------------------------------------------------------------===
 224
 225 Simple IPO for argument passing, change:
 226   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 227
 228 the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 229 of arguments get assigned to r3 through r10. That is, if you have a function
 230 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 231 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 232 order for functions we can internalize so that the maximum number of
 233 integers/pointers get passed in regs before you see any of the fp arguments.
 234
 235 Instead of implementing this, it would actually probably be easier to just
 236 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 237 including having this work sanely.
 238
 239 ===-------------------------------------------------------------------------===
 240
 241 Fix Darwin FP-In-Integer Registers ABI
 242
 243 Darwin passes doubles in structures in integer registers, which is very very
 244 bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
 245 that percolates these things out of functions.
 246
 247 Check out how horrible this is:
 248 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 249
 250 This is an extension of "interprocedural CC unmunging" that can't be done with
 251 just fastcc.
 252
 253 ===-------------------------------------------------------------------------===
 254
 255 Compile this:
 256
 257 int foo(int a) {
 258   int b = (a < 8);
 259   if (b) {
 260     return b * 3;     // ignore the fact that this is always 3.
 261   } else {
 262     return 2;
 263   }
 264 }
 265
 266 into something not this:
 267
 268 _foo:
 269 1)      cmpwi cr7, r3, 8
 270         mfcr r2, 1
 271         rlwinm r2, r2, 29, 31, 31
 272 1)      cmpwi cr0, r3, 7
 273         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 274 LBB1_1: ; then
 275         rlwinm r2, r2, 0, 31, 31
 276         mulli r3, r2, 3
 277         blr
 278 LBB1_2: ; UnifiedReturnBlock
 279         li r3, 2
 280         blr
 281
 282 In particular, the two compares (marked 1) could be shared by reversing one.
 283 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 284 same operands (but backwards) exists.  In this case, this wouldn't save us
 285 anything though, because the compares still wouldn't be shared.
 286
 287 ===-------------------------------------------------------------------------===
 288
 289 We should custom expand setcc instead of pretending that we have it.  That
 290 would allow us to expose the access of the crbit after the mfcr, allowing
 291 that access to be trivially folded into other ops.  A simple example:
 292
 293 int foo(int a, int b) { return (a < b) << 4; }
 294
 295 compiles into:
 296
 297 _foo:
 298         cmpw cr7, r3, r4
 299         mfcr r2, 1
 300         rlwinm r2, r2, 29, 31, 31
 301         slwi r3, r2, 4
 302         blr
 303
 304 ===-------------------------------------------------------------------------===
 305
 306 Fold add and sub with constant into non-extern, non-weak addresses so this:
 307
 308 static int a;
 309 void bar(int b) { a = b; }
 310 void foo(unsigned char *c) {
 311   *c = a;
 312 }
 313
 314 So that
 315
 316 _foo:
 317         lis r2, ha16(_a)
 318         la r2, lo16(_a)(r2)
 319         lbz r2, 3(r2)
 320         stb r2, 0(r3)
 321         blr
 322
 323 Becomes
 324
 325 _foo:
 326         lis r2, ha16(_a+3)
 327         lbz r2, lo16(_a+3)(r2)
 328         stb r2, 0(r3)
 329         blr
 330
 331 ===-------------------------------------------------------------------------===
 332
 333 We generate really bad code for this:
 334
 335 int f(signed char *a, _Bool b, _Bool c) {
 336    signed char t = 0;
 337   if (b)  t = *a;
 338   if (c)  *a = t;
 339 }
 340
 341 ===-------------------------------------------------------------------------===
 342
 343 This:
 344 int test(unsigned *P) { return *P >> 24; }
 345
 346 Should compile to:
 347
 348 _test:
 349         lbz r3,0(r3)
 350         blr
 351
 352 not:
 353
 354 _test:
 355         lwz r2, 0(r3)
 356         srwi r3, r2, 24
 357         blr
 358
 359 ===-------------------------------------------------------------------------===
 360
 361 On the G5, logical CR operations are more expensive in their three
 362 address form: ops that read/write the same register are half as expensive as
 363 those that read from two registers that are different from their destination.
 364
 365 We should model this with two separate instructions.  The isel should generate
 366 the "two address" form of the instructions.  When the register allocator
 367 detects that it needs to insert a copy due to the two-addresness of the CR
 368 logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
 369 we can convert to the "three address" instruction, to save code space.
 370
 371 This only matters when we start generating cr logical ops.
 372
 373 ===-------------------------------------------------------------------------===
 374
 375 We should compile these two functions to the same thing:
 376
 377 #include <stdlib.h>
 378 void f(int a, int b, int *P) {
 379   *P = (a-b)>=0?(a-b):(b-a);
 380 }
 381 void g(int a, int b, int *P) {
 382   *P = abs(a-b);
 383 }
 384
 385 Further, they should compile to something better than:
 386
 387 _g:
 388         subf r2, r4, r3
 389         subfic r3, r2, 0
 390         cmpwi cr0, r2, -1
 391         bgt cr0, LBB2_2 ; entry
 392 LBB2_1: ; entry
 393         mr r2, r3
 394 LBB2_2: ; entry
 395         stw r2, 0(r5)
 396         blr
 397
 398 GCC produces:
 399
 400 _g:
 401         subf r4,r4,r3
 402         srawi r2,r4,31
 403         xor r0,r2,r4
 404         subf r0,r2,r0
 405         stw r0,0(r5)
 406         blr
 407
 408 ... which is much nicer.
 409
 410 This theoretically may help improve twolf slightly (used in dimbox.c:142?).
 411
 412 ===-------------------------------------------------------------------------===
 413
 414 int foo(int N, int ***W, int **TK, int X) {
 415   int t, i;
 416
 417   for (t = 0; t < N; ++t)
 418     for (i = 0; i < 4; ++i)
 419       W[t / X][i][t % X] = TK[i][t];
 420
 421   return 5;
 422 }
 423
 424 We generate relatively atrocious code for this loop compared to gcc.
 425
 426 We could also strength reduce the rem and the div:
 427 http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
 428
 429 ===-------------------------------------------------------------------------===
 430
 431 float foo(float X) { return (int)(X); }
 432
 433 Currently produces:
 434
 435 _foo:
 436         fctiwz f0, f1
 437         stfd f0, -8(r1)
 438         lwz r2, -4(r1)
 439         extsw r2, r2
 440         std r2, -16(r1)
 441         lfd f0, -16(r1)
 442         fcfid f0, f0
 443         frsp f1, f0
 444         blr
 445
 446 We could use a target dag combine to turn the lwz/extsw into an lwa when the
 447 lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
 448 win only.
 449
 450 ===-------------------------------------------------------------------------===
 451
 452 We generate ugly code for this:
 453
 454 void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
 455   unsigned code = 0;
 456   if(dx < -dw) code |= 1;
 457   if(dx > dw)  code |= 2;
 458   if(dy < -dw) code |= 4;
 459   if(dy > dw)  code |= 8;
 460   if(dz < -dw) code |= 16;
 461   if(dz > dw)  code |= 32;
 462   *ret = code;
 463 }
 464
 465 ===-------------------------------------------------------------------------===
 466
 467 Complete the signed i32 to FP conversion code using 64-bit registers
 468 transformation, good for PI.  See PPCISelLowering.cpp, this comment:
 469
 470      // FIXME: disable this lowered code.  This generates 64-bit register values,
 471      // and we don't model the fact that the top part is clobbered by calls.  We
 472      // need to flag these together so that the value isn't live across a call.
 473      //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 474
 475 Also, if the registers are spilled to the stack, we have to ensure that all
 476 64-bits of them are save/restored, otherwise we will miscompile the code.  It
 477 sounds like we need to get the 64-bit register classes going.
 478
 479 ===-------------------------------------------------------------------------===
 480
 481 %struct.B = type { i8, [3 x i8] }
 482
 483 define void @bar(%struct.B* %b) {
 484 entry:
 485         %tmp = bitcast %struct.B* %b to i32*              ; <uint*> [#uses=1]
 486         %tmp = load i32* %tmp          ; <uint> [#uses=1]
 487         %tmp3 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=1]
 488         %tmp4 = load i32* %tmp3                ; <uint> [#uses=1]
 489         %tmp8 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=2]
 490         %tmp9 = load i32* %tmp8                ; <uint> [#uses=1]
 491         %tmp4.mask17 = shl i32 %tmp4, i8 1          ; <uint> [#uses=1]
 492         %tmp1415 = and i32 %tmp4.mask17, 2147483648            ; <uint> [#uses=1]
 493         %tmp.masked = and i32 %tmp, 2147483648         ; <uint> [#uses=1]
 494         %tmp11 = or i32 %tmp1415, %tmp.masked          ; <uint> [#uses=1]
 495         %tmp12 = and i32 %tmp9, 2147483647             ; <uint> [#uses=1]
 496         %tmp13 = or i32 %tmp12, %tmp11         ; <uint> [#uses=1]
 497         store i32 %tmp13, i32* %tmp8
 498         ret void
 499 }
 500
 501 We emit:
 502
 503 _foo:
 504         lwz r2, 0(r3)
 505         slwi r4, r2, 1
 506         or r4, r4, r2
 507         rlwimi r2, r4, 0, 0, 0
 508         stw r2, 0(r3)
 509         blr
 510
 511 We could collapse a bunch of those ORs and ANDs and generate the following
 512 equivalent code:
 513
 514 _foo:
 515         lwz r2, 0(r3)
 516         rlwinm r4, r2, 1, 0, 0
 517         or r2, r2, r4
 518         stw r2, 0(r3)
 519         blr
 520
 521 ===-------------------------------------------------------------------------===
 522
 523 We compile:
 524
 525 unsigned test6(unsigned x) {
 526   return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
 527 }
 528
 529 into:
 530
 531 _test6:
 532         lis r2, 255
 533         rlwinm r3, r3, 16, 0, 31
 534         ori r2, r2, 255
 535         and r3, r3, r2
 536         blr
 537
 538 GCC gets it down to:
 539
 540 _test6:
 541         rlwinm r0,r3,16,8,15
 542         rlwinm r3,r3,16,24,31
 543         or r3,r3,r0
 544         blr
 545
 546
 547 ===-------------------------------------------------------------------------===
 548
 549 Consider a function like this:
 550
 551 float foo(float X) { return X + 1234.4123f; }
 552
 553 The FP constant ends up in the constant pool, so we need to get the LR register.
 554  This ends up producing code like this:
 555
 556 _foo:
 557 .LBB_foo_0:     ; entry
 558         mflr r11
 559 ***     stw r11, 8(r1)
 560         bl "L00000$pb"
 561 "L00000$pb":
 562         mflr r2
 563         addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
 564         lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
 565         fadds f1, f1, f0
 566 ***     lwz r11, 8(r1)
 567         mtlr r11
 568         blr
 569
 570 This is functional, but there is no reason to spill the LR register all the way
 571 to the stack (the two marked instrs): spilling it to a GPR is quite enough.
 572
 573 Implementing this will require some codegen improvements.  Nate writes:
 574
 575 "So basically what we need to support the "no stack frame save and restore" is a
 576 generalization of the LR optimization to "callee-save regs".
 577
 578 Currently, we have LR marked as a callee-save reg.  The register allocator sees
 579 that it's callee save, and spills it directly to the stack.
 580
 581 Ideally, something like this would happen:
 582
 583 LR would be in a separate register class from the GPRs. The class of LR would be
 584 marked "unspillable".  When the register allocator came across an unspillable
 585 reg, it would ask "what is the best class to copy this into that I *can* spill"
 586 If it gets a class back, which it will in this case (the gprs), it grabs a free
 587 register of that class.  If it is then later necessary to spill that reg, so be
 588 it.
 589
 590 ===-------------------------------------------------------------------------===
 591
 592 We compile this:
 593 int test(_Bool X) {
 594   return X ? 524288 : 0;
 595 }
 596
 597 to:
 598 _test:
 599         cmplwi cr0, r3, 0
 600         lis r2, 8
 601         li r3, 0
 602         beq cr0, LBB1_2 ;entry
 603 LBB1_1: ;entry
 604         mr r3, r2
 605 LBB1_2: ;entry
 606         blr
 607
 608 instead of:
 609 _test:
 610         addic r2,r3,-1
 611         subfe r0,r2,r3
 612         slwi r3,r0,19
 613         blr
 614
 615 This sort of thing occurs a lot due to globalopt.
 616
 617 ===-------------------------------------------------------------------------===
 618
 619 We currently compile 32-bit bswap:
 620
 621 declare i32 @llvm.bswap.i32(i32 %A)
 622 define i32 @test(i32 %A) {
 623         %B = call i32 @llvm.bswap.i32(i32 %A)
 624         ret i32 %B
 625 }
 626
 627 to:
 628
 629 _test:
 630         rlwinm r2, r3, 24, 16, 23
 631         slwi r4, r3, 24
 632         rlwimi r2, r3, 8, 24, 31
 633         rlwimi r4, r3, 8, 8, 15
 634         rlwimi r4, r2, 0, 16, 31
 635         mr r3, r4
 636         blr
 637
 638 it would be more efficient to produce:
 639
 640 _foo:   mr r0,r3
 641         rlwinm r3,r3,8,0xffffffff
 642         rlwimi r3,r0,24,0,7
 643         rlwimi r3,r0,24,16,23
 644         blr
 645
 646 ===-------------------------------------------------------------------------===
 647
 648 test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:
 649
 650 __ZNK4llvm5APInt17countLeadingZerosEv:
 651         ld r2, 0(r3)
 652         cntlzd r2, r2
 653         or r2, r2, r2     <<-- silly.
 654         addi r3, r2, -64
 655         blr
 656
 657 The dead or is a 'truncate' from 64- to 32-bits.
 658
 659 ===-------------------------------------------------------------------------===
 660
 661 We generate horrible ppc code for this:
 662
 663 #define N  2000000
 664 double   a[N],c[N];
 665 void simpleloop() {
 666    int j;
 667    for (j=0; j<N; j++)
 668      c[j] = a[j];
 669 }
 670
 671 LBB1_1: ;bb
 672         lfdx f0, r3, r4
 673         addi r5, r5, 1                 ;; Extra IV for the exit value compare.
 674         stfdx f0, r2, r4
 675         addi r4, r4, 8
 676
 677         xoris r6, r5, 30               ;; This is due to a large immediate.
 678         cmplwi cr0, r6, 33920
 679         bne cr0, LBB1_1
 680
 681 //===---------------------------------------------------------------------===//
 682
 683 This:
 684         #include <algorithm>
 685         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
 686         { return std::make_pair(a + b, a + b < a); }
 687         bool no_overflow(unsigned a, unsigned b)
 688         { return !full_add(a, b).second; }
 689
 690 Should compile to:
 691
 692 __Z11no_overflowjj:
 693         add r4,r3,r4
 694         subfc r3,r3,r4
 695         li r3,0
 696         adde r3,r3,r3
 697         blr
 698
 699 (or better) not:
 700
 701 __Z11no_overflowjj:
 702         add r2, r4, r3
 703         cmplw cr7, r2, r3
 704         mfcr r2
 705         rlwinm r2, r2, 29, 31, 31
 706         xori r3, r2, 1
 707         blr
 708
 709 //===---------------------------------------------------------------------===//
 710
 711 We compile some FP comparisons into an mfcr with two rlwinms and an or.  For
 712 example:
 713 #include <math.h>
 714 int test(double x, double y) { return islessequal(x, y);}
 715 int test2(double x, double y) {  return islessgreater(x, y);}
 716 int test3(double x, double y) {  return !islessequal(x, y);}
 717
 718 Compiles into (all three are similar, but the bits differ):
 719
 720 _test:
 721         fcmpu cr7, f1, f2
 722         mfcr r2
 723         rlwinm r3, r2, 29, 31, 31
 724         rlwinm r2, r2, 31, 31, 31
 725         or r3, r2, r3
 726         blr
 727
 728 GCC compiles this into:
 729
 730  _test:
 731         fcmpu cr7,f1,f2
 732         cror 30,28,30
 733         mfcr r3
 734         rlwinm r3,r3,31,1
 735         blr
 736
 737 which is more efficient and can use mfocr.  See PR642 for some more context.
 738
 739 //===---------------------------------------------------------------------===//