lib/Target/PowerPC/README.txt

   1 //===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
   2
   3 TODO:
   4 * gpr0 allocation
   5 * lmw/stmw pass a la arm load store optimizer for prolog/epilog
   6
   7 ===-------------------------------------------------------------------------===
   8
   9 On PPC64, this:
  10
  11 long f2 (long x) { return 0xfffffff000000000UL; }
  12 long f3 (long x) { return 0x1ffffffffUL; }
  13
  14 could compile into:
  15
  16 _f2:
  17         li r3,-1
  18         rldicr r3,r3,0,27
  19         blr
  20 _f3:
  21         li r3,-1
  22         rldicl r3,r3,0,31
  23         blr
  24
  25 we produce:
  26
  27 _f2:
  28         lis r2, 4095
  29         ori r2, r2, 65535
  30         sldi r3, r2, 36
  31         blr
  32 _f3:
  33         li r2, 1
  34         sldi r2, r2, 32
  35         oris r2, r2, 65535
  36         ori r3, r2, 65535
  37         blr
  38
  39 ===-------------------------------------------------------------------------===
  40
  41 This code:
  42
  43 unsigned add32carry(unsigned sum, unsigned x) {
  44  unsigned z = sum + x;
  45  if (sum + x < x)
  46      z++;
  47  return z;
  48 }
  49
  50 Should compile to something like:
  51
  52         addc r3,r3,r4
  53         addze r3,r3
  54
  55 instead we get:
  56
  57         add r3, r4, r3
  58         cmplw cr7, r3, r4
  59         mfcr r4 ; 1
  60         rlwinm r4, r4, 29, 31, 31
  61         add r3, r3, r4
  62
  63 Ick.
  64
  65 ===-------------------------------------------------------------------------===
  66
  67 Support 'update' load/store instructions.  These are cracked on the G5, but are
  68 still a codesize win.
  69
  70 With preinc enabled, this:
  71
  72 long *%test4(long *%X, long *%dest) {
  73         %Y = getelementptr long* %X, int 4
  74         %A = load long* %Y
  75         store long %A, long* %dest
  76         ret long* %Y
  77 }
  78
  79 compiles to:
  80
  81 _test4:
  82         mr r2, r3
  83         lwzu r5, 32(r2)
  84         lwz r3, 36(r3)
  85         stw r5, 0(r4)
  86         stw r3, 4(r4)
  87         mr r3, r2
  88         blr
  89
  90 with -sched=list-burr, I get:
  91
  92 _test4:
  93         lwz r2, 36(r3)
  94         lwzu r5, 32(r3)
  95         stw r2, 4(r4)
  96         stw r5, 0(r4)
  97         blr
  98
  99 ===-------------------------------------------------------------------------===
 100
 101 We compile the hottest inner loop of viterbi to:
 102
 103         li r6, 0
 104         b LBB1_84       ;bb432.i
 105 LBB1_83:        ;bb420.i
 106         lbzx r8, r5, r7
 107         addi r6, r7, 1
 108         stbx r8, r4, r7
 109 LBB1_84:        ;bb432.i
 110         mr r7, r6
 111         cmplwi cr0, r7, 143
 112         bne cr0, LBB1_83        ;bb420.i
 113
 114 The CBE manages to produce:
 115
 116         li r0, 143
 117         mtctr r0
 118 loop:
 119         lbzx r2, r2, r11
 120         stbx r0, r2, r9
 121         addi r2, r2, 1
 122         bdz later
 123         b loop
 124
 125 This could be much better (bdnz instead of bdz) but it still beats us.  If we
 126 produced this with bdnz, the loop would be a single dispatch group.
 127
 128 ===-------------------------------------------------------------------------===
 129
 130 Compile:
 131
 132 void foo(int *P) {
 133  if (P)  *P = 0;
 134 }
 135
 136 into:
 137
 138 _foo:
 139         cmpwi cr0,r3,0
 140         beqlr cr0
 141         li r0,0
 142         stw r0,0(r3)
 143         blr
 144
 145 This is effectively a simple form of predication.
 146
 147 ===-------------------------------------------------------------------------===
 148
 149 Lump the constant pool for each function into ONE pic object, and reference
 150 pieces of it as offsets from the start.  For functions like this (contrived
 151 to have lots of constants obviously):
 152
 153 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
 154
 155 We generate:
 156
 157 _X:
 158         lis r2, ha16(.CPI_X_0)
 159         lfd f0, lo16(.CPI_X_0)(r2)
 160         lis r2, ha16(.CPI_X_1)
 161         lfd f2, lo16(.CPI_X_1)(r2)
 162         fmadd f0, f1, f0, f2
 163         lis r2, ha16(.CPI_X_2)
 164         lfd f1, lo16(.CPI_X_2)(r2)
 165         lis r2, ha16(.CPI_X_3)
 166         lfd f2, lo16(.CPI_X_3)(r2)
 167         fmadd f1, f0, f1, f2
 168         blr
 169
 170 It would be better to materialize .CPI_X into a register, then use immediates
 171 off of the register to avoid the lis's.  This is even more important in PIC
 172 mode.
 173
 174 Note that this (and the static variable version) is discussed here for GCC:
 175 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 176
 177 Here's another example (the sgn function):
 178 double testf(double a) {
 179        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
 180 }
 181
 182 it produces a BB like this:
 183 LBB1_1: ; cond_true
 184         lis r2, ha16(LCPI1_0)
 185         lfs f0, lo16(LCPI1_0)(r2)
 186         lis r2, ha16(LCPI1_1)
 187         lis r3, ha16(LCPI1_2)
 188         lfs f2, lo16(LCPI1_2)(r3)
 189         lfs f3, lo16(LCPI1_1)(r2)
 190         fsub f0, f0, f1
 191         fsel f1, f0, f2, f3
 192         blr
 193
 194 ===-------------------------------------------------------------------------===
 195
 196 PIC Code Gen IPO optimization:
 197
 198 Squish small scalar globals together into a single global struct, allowing the
 199 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
 200 of the GOT on targets with one).
 201
 202 Note that this is discussed here for GCC:
 203 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 204
 205 ===-------------------------------------------------------------------------===
 206
 207 Implement Newton-Raphson method for improving estimate instructions to the
 208 correct accuracy, and implementing divide as multiply by reciprocal when it has
 209 more than one use.  Itanium would want this too.
 210
 211 ===-------------------------------------------------------------------------===
 212
 213 Compile offsets from allocas:
 214
 215 int *%test() {
 216         %X = alloca { int, int }
 217         %Y = getelementptr {int,int}* %X, int 0, uint 1
 218         ret int* %Y
 219 }
 220
 221 into a single add, not two:
 222
 223 _test:
 224         addi r2, r1, -8
 225         addi r3, r2, 4
 226         blr
 227
 228 --> important for C++.
 229
 230 ===-------------------------------------------------------------------------===
 231
 232 No loads or stores of the constants should be needed:
 233
 234 struct foo { double X, Y; };
 235 void xxx(struct foo F);
 236 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 237
 238 ===-------------------------------------------------------------------------===
 239
 240 Darwin Stub removal:
 241
 242 We still generate calls to foo$stub, and stubs, on Darwin.  This is not
 243 necessary when building with the Leopard (10.5) or later linker, as stubs are
 244 generated by ld when necessary.  Parameterizing this based on the deployment
 245 target (-mmacosx-version-min) is probably enough.  x86-32 does this right, see
 246 its logic.
 247
 248 ===-------------------------------------------------------------------------===
 249
 250 Darwin Stub LICM optimization:
 251
 252 Loops like this:
 253
 254   for (...)  bar();
 255
 256 Have to go through an indirect stub if bar is external or linkonce.  It would
 257 be better to compile it as:
 258
 259      fp = &bar;
 260      for (...)  fp();
 261
 262 which only computes the address of bar once (instead of each time through the
 263 stub).  This is Darwin specific and would have to be done in the code generator.
 264 Probably not a win on x86.
 265
 266 ===-------------------------------------------------------------------------===
 267
 268 Simple IPO for argument passing, change:
 269   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 270
 271 The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 272 of arguments get assigned to r3 through r10. That is, if you have a function
 273 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 274 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 275 order for functions we can internalize so that the maximum number of
 276 integers/pointers get passed in regs before you see any of the fp arguments.
 277
 278 Instead of implementing this, it would actually probably be easier to just
 279 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 280 including having this work sanely.
 281
 282 ===-------------------------------------------------------------------------===
 283
 284 Fix Darwin FP-In-Integer Registers ABI
 285
 286 Darwin passes doubles in structures in integer registers, which is very very
 287 bad.  Add something like a BITCAST to LLVM, then do an i-p transformation that
 288 percolates these things out of functions.
 289
 290 Check out how horrible this is:
 291 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 292
 293 This is an extension of "interprocedural CC unmunging" that can't be done with
 294 just fastcc.
 295
 296 ===-------------------------------------------------------------------------===
 297
 298 Compile this:
 299
 300 int foo(int a) {
 301   int b = (a < 8);
 302   if (b) {
 303     return b * 3;     // ignore the fact that this is always 3.
 304   } else {
 305     return 2;
 306   }
 307 }
 308
 309 into something not this:
 310
 311 _foo:
 312 1)      cmpwi cr7, r3, 8
 313         mfcr r2, 1
 314         rlwinm r2, r2, 29, 31, 31
 315 1)      cmpwi cr0, r3, 7
 316         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 317 LBB1_1: ; then
 318         rlwinm r2, r2, 0, 31, 31
 319         mulli r3, r2, 3
 320         blr
 321 LBB1_2: ; UnifiedReturnBlock
 322         li r3, 2
 323         blr
 324
 325 In particular, the two compares (marked 1) could be shared by reversing one.
 326 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 327 same operands (but backwards) exists.  In this case, this wouldn't save us
 328 anything though, because the compares still wouldn't be shared.
 329
 330 ===-------------------------------------------------------------------------===
 331
 332 We should custom expand setcc instead of pretending that we have it.  That
 333 would allow us to expose the access of the crbit after the mfcr, allowing
 334 that access to be trivially folded into other ops.  A simple example:
 335
 336 int foo(int a, int b) { return (a < b) << 4; }
 337
 338 compiles into:
 339
 340 _foo:
 341         cmpw cr7, r3, r4
 342         mfcr r2, 1
 343         rlwinm r2, r2, 29, 31, 31
 344         slwi r3, r2, 4
 345         blr
 346
 347 ===-------------------------------------------------------------------------===
 348
 349 Fold add and sub with constant into non-extern, non-weak addresses so this:
 350
 351 static int a;
 352 void bar(int b) { a = b; }
 353 void foo(unsigned char *c) {
 354   *c = a;
 355 }
 356
 357 So that
 358
 359 _foo:
 360         lis r2, ha16(_a)
 361         la r2, lo16(_a)(r2)
 362         lbz r2, 3(r2)
 363         stb r2, 0(r3)
 364         blr
 365
 366 Becomes
 367
 368 _foo:
 369         lis r2, ha16(_a+3)
 370         lbz r2, lo16(_a+3)(r2)
 371         stb r2, 0(r3)
 372         blr
 373
 374 ===-------------------------------------------------------------------------===
 375
 376 We generate really bad code for this:
 377
 378 int f(signed char *a, _Bool b, _Bool c) {
 379    signed char t = 0;
 380   if (b)  t = *a;
 381   if (c)  *a = t;
 382 }
 383
 384 ===-------------------------------------------------------------------------===
 385
 386 This:
 387 int test(unsigned *P) { return *P >> 24; }
 388
 389 Should compile to:
 390
 391 _test:
 392         lbz r3,0(r3)
 393         blr
 394
 395 not:
 396
 397 _test:
 398         lwz r2, 0(r3)
 399         srwi r3, r2, 24
 400         blr
 401
 402 ===-------------------------------------------------------------------------===
 403
 404 On the G5, logical CR operations are more expensive in their three
 405 address form: ops that read/write the same register are half as expensive as
 406 those that read from two registers that are different from their destination.
 407
 408 We should model this with two separate instructions.  The isel should generate
 409 the "two address" form of the instructions.  When the register allocator
 410 detects that it needs to insert a copy due to the two-addressness of the CR
 411 logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
 412 we can convert to the "three address" instruction, to save code space.
 413
 414 This only matters when we start generating cr logical ops.
 415
 416 ===-------------------------------------------------------------------------===
 417
 418 We should compile these two functions to the same thing:
 419
 420 #include <stdlib.h>
 421 void f(int a, int b, int *P) {
 422   *P = (a-b)>=0?(a-b):(b-a);
 423 }
 424 void g(int a, int b, int *P) {
 425   *P = abs(a-b);
 426 }
 427
 428 Further, they should compile to something better than:
 429
 430 _g:
 431         subf r2, r4, r3
 432         subfic r3, r2, 0
 433         cmpwi cr0, r2, -1
 434         bgt cr0, LBB2_2 ; entry
 435 LBB2_1: ; entry
 436         mr r2, r3
 437 LBB2_2: ; entry
 438         stw r2, 0(r5)
 439         blr
 440
 441 GCC produces:
 442
 443 _g:
 444         subf r4,r4,r3
 445         srawi r2,r4,31
 446         xor r0,r2,r4
 447         subf r0,r2,r0
 448         stw r0,0(r5)
 449         blr
 450
 451 ... which is much nicer.
 452
 453 This theoretically may help improve twolf slightly (used in dimbox.c:142?).
 454
 455 ===-------------------------------------------------------------------------===
 456
 457 PR5945: This:
 458 define i32 @clamp0g(i32 %a) {
 459 entry:
 460         %cmp = icmp slt i32 %a, 0
 461         %sel = select i1 %cmp, i32 0, i32 %a
 462         ret i32 %sel
 463 }
 464
 465 Is compiled to this with the PowerPC (32-bit) backend:
 466
 467 _clamp0g:
 468         cmpwi cr0, r3, 0
 469         li r2, 0
 470         blt cr0, LBB1_2
 471 ; BB#1:                                                     ; %entry
 472         mr r2, r3
 473 LBB1_2:                                                     ; %entry
 474         mr r3, r2
 475         blr
 476
 477 This could be reduced to the much simpler:
 478
 479 _clamp0g:
 480         srawi r2, r3, 31
 481         andc r3, r3, r2
 482         blr
 483
 484 ===-------------------------------------------------------------------------===
 485
 486 int foo(int N, int ***W, int **TK, int X) {
 487   int t, i;
 488
 489   for (t = 0; t < N; ++t)
 490     for (i = 0; i < 4; ++i)
 491       W[t / X][i][t % X] = TK[i][t];
 492
 493   return 5;
 494 }
 495
 496 We generate relatively atrocious code for this loop compared to gcc.
 497
 498 We could also strength reduce the rem and the div:
 499 http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
 500
 501 ===-------------------------------------------------------------------------===
 502
 503 float foo(float X) { return (int)(X); }
 504
 505 Currently produces:
 506
 507 _foo:
 508         fctiwz f0, f1
 509         stfd f0, -8(r1)
 510         lwz r2, -4(r1)
 511         extsw r2, r2
 512         std r2, -16(r1)
 513         lfd f0, -16(r1)
 514         fcfid f0, f0
 515         frsp f1, f0
 516         blr
 517
 518 We could use a target dag combine to turn the lwz/extsw into an lwa when the
 519 lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
 520 win only.
 521
 522 ===-------------------------------------------------------------------------===
 523
 524 We generate ugly code for this:
 525
 526 void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
 527   unsigned code = 0;
 528   if(dx < -dw) code |= 1;
 529   if(dx > dw)  code |= 2;
 530   if(dy < -dw) code |= 4;
 531   if(dy > dw)  code |= 8;
 532   if(dz < -dw) code |= 16;
 533   if(dz > dw)  code |= 32;
 534   *ret = code;
 535 }
 536
 537 ===-------------------------------------------------------------------------===
 538
 539 Complete the signed i32 to FP conversion code using 64-bit registers
 540 transformation, good for PI.  See PPCISelLowering.cpp, this comment:
 541
 542      // FIXME: disable this lowered code.  This generates 64-bit register values,
 543      // and we don't model the fact that the top part is clobbered by calls.  We
 544      // need to flag these together so that the value isn't live across a call.
 545      //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 546
 547 Also, if the registers are spilled to the stack, we have to ensure that all
 548 64-bits of them are saved/restored, otherwise we will miscompile the code.  It
 549 sounds like we need to get the 64-bit register classes going.
 550
 551 ===-------------------------------------------------------------------------===
 552
 553 %struct.B = type { i8, [3 x i8] }
 554
 555 define void @bar(%struct.B* %b) {
 556 entry:
 557         %tmp = bitcast %struct.B* %b to i32*              ; <uint*> [#uses=1]
 558         %tmp = load i32* %tmp          ; <uint> [#uses=1]
 559         %tmp3 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=1]
 560         %tmp4 = load i32* %tmp3                ; <uint> [#uses=1]
 561         %tmp8 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=2]
 562         %tmp9 = load i32* %tmp8                ; <uint> [#uses=1]
 563         %tmp4.mask17 = shl i32 %tmp4, i8 1          ; <uint> [#uses=1]
 564         %tmp1415 = and i32 %tmp4.mask17, 2147483648            ; <uint> [#uses=1]
 565         %tmp.masked = and i32 %tmp, 2147483648         ; <uint> [#uses=1]
 566         %tmp11 = or i32 %tmp1415, %tmp.masked          ; <uint> [#uses=1]
 567         %tmp12 = and i32 %tmp9, 2147483647             ; <uint> [#uses=1]
 568         %tmp13 = or i32 %tmp12, %tmp11         ; <uint> [#uses=1]
 569         store i32 %tmp13, i32* %tmp8
 570         ret void
 571 }
 572
 573 We emit:
 574
 575 _foo:
 576         lwz r2, 0(r3)
 577         slwi r4, r2, 1
 578         or r4, r4, r2
 579         rlwimi r2, r4, 0, 0, 0
 580         stw r2, 0(r3)
 581         blr
 582
 583 We could collapse a bunch of those ORs and ANDs and generate the following
 584 equivalent code:
 585
 586 _foo:
 587         lwz r2, 0(r3)
 588         rlwinm r4, r2, 1, 0, 0
 589         or r2, r2, r4
 590         stw r2, 0(r3)
 591         blr
 592
 593 ===-------------------------------------------------------------------------===
 594
 595 We compile:
 596
 597 unsigned test6(unsigned x) {
 598   return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
 599 }
 600
 601 into:
 602
 603 _test6:
 604         lis r2, 255
 605         rlwinm r3, r3, 16, 0, 31
 606         ori r2, r2, 255
 607         and r3, r3, r2
 608         blr
 609
 610 GCC gets it down to:
 611
 612 _test6:
 613         rlwinm r0,r3,16,8,15
 614         rlwinm r3,r3,16,24,31
 615         or r3,r3,r0
 616         blr
 617
 618
 619 ===-------------------------------------------------------------------------===
 620
 621 Consider a function like this:
 622
 623 float foo(float X) { return X + 1234.4123f; }
 624
 625 The FP constant ends up in the constant pool, so we need to get the LR register.
 626  This ends up producing code like this:
 627
 628 _foo:
 629 .LBB_foo_0:     ; entry
 630         mflr r11
 631 ***     stw r11, 8(r1)
 632         bl "L00000$pb"
 633 "L00000$pb":
 634         mflr r2
 635         addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
 636         lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
 637         fadds f1, f1, f0
 638 ***     lwz r11, 8(r1)
 639         mtlr r11
 640         blr
 641
 642 This is functional, but there is no reason to spill the LR register all the way
 643 to the stack (the two marked instrs): spilling it to a GPR is quite enough.
 644
 645 Implementing this will require some codegen improvements.  Nate writes:
 646
 647 "So basically what we need to support the "no stack frame save and restore" is a
 648 generalization of the LR optimization to "callee-save regs".
 649
 650 Currently, we have LR marked as a callee-save reg.  The register allocator sees
 651 that it's callee save, and spills it directly to the stack.
 652
 653 Ideally, something like this would happen:
 654
 655 LR would be in a separate register class from the GPRs. The class of LR would be
 656 marked "unspillable".  When the register allocator came across an unspillable
 657 reg, it would ask "what is the best class to copy this into that I *can* spill"
 658 If it gets a class back, which it will in this case (the gprs), it grabs a free
 659 register of that class.  If it is then later necessary to spill that reg, so be
 660 it."
 661
 662 ===-------------------------------------------------------------------------===
 663
 664 We compile this:
 665 int test(_Bool X) {
 666   return X ? 524288 : 0;
 667 }
 668
 669 to:
 670 _test:
 671         cmplwi cr0, r3, 0
 672         lis r2, 8
 673         li r3, 0
 674         beq cr0, LBB1_2 ;entry
 675 LBB1_1: ;entry
 676         mr r3, r2
 677 LBB1_2: ;entry
 678         blr
 679
 680 instead of:
 681 _test:
 682         addic r2,r3,-1
 683         subfe r0,r2,r3
 684         slwi r3,r0,19
 685         blr
 686
 687 This sort of thing occurs a lot due to globalopt.
 688
 689 ===-------------------------------------------------------------------------===
 690
 691 We compile:
 692
 693 define i32 @bar(i32 %x) nounwind readnone ssp {
 694 entry:
 695   %0 = icmp eq i32 %x, 0                          ; <i1> [#uses=1]
 696   %neg = sext i1 %0 to i32              ; <i32> [#uses=1]
 697   ret i32 %neg
 698 }
 699
 700 to:
 701
 702 _bar:
 703         cntlzw r2, r3
 704         slwi r2, r2, 26
 705         srawi r3, r2, 31
 706         blr
 707
 708 it would be better to produce:
 709
 710 _bar:
 711         addic r3,r3,-1
 712         subfe r3,r3,r3
 713         blr
 714
 715 ===-------------------------------------------------------------------------===
 716
 717 We currently compile 32-bit bswap:
 718
 719 declare i32 @llvm.bswap.i32(i32 %A)
 720 define i32 @test(i32 %A) {
 721         %B = call i32 @llvm.bswap.i32(i32 %A)
 722         ret i32 %B
 723 }
 724
 725 to:
 726
 727 _test:
 728         rlwinm r2, r3, 24, 16, 23
 729         slwi r4, r3, 24
 730         rlwimi r2, r3, 8, 24, 31
 731         rlwimi r4, r3, 8, 8, 15
 732         rlwimi r4, r2, 0, 16, 31
 733         mr r3, r4
 734         blr
 735
 736 it would be more efficient to produce:
 737
 738 _foo:   mr r0,r3
 739         rlwinm r3,r3,8,0xffffffff
 740         rlwimi r3,r0,24,0,7
 741         rlwimi r3,r0,24,16,23
 742         blr
 743
 744 ===-------------------------------------------------------------------------===
 745
 746 test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:
 747
 748 __ZNK4llvm5APInt17countLeadingZerosEv:
 749         ld r2, 0(r3)
 750         cntlzd r2, r2
 751         or r2, r2, r2     <<-- silly.
 752         addi r3, r2, -64
 753         blr
 754
 755 The dead or is a 'truncate' from 64- to 32-bits.
 756
 757 ===-------------------------------------------------------------------------===
 758
 759 We generate horrible ppc code for this:
 760
 761 #define N  2000000
 762 double   a[N],c[N];
 763 void simpleloop() {
 764    int j;
 765    for (j=0; j<N; j++)
 766      c[j] = a[j];
 767 }
 768
 769 LBB1_1: ;bb
 770         lfdx f0, r3, r4
 771         addi r5, r5, 1                 ;; Extra IV for the exit value compare.
 772         stfdx f0, r2, r4
 773         addi r4, r4, 8
 774
 775         xoris r6, r5, 30               ;; This is due to a large immediate.
 776         cmplwi cr0, r6, 33920
 777         bne cr0, LBB1_1
 778
 779 //===---------------------------------------------------------------------===//
 780
 781 This:
 782         #include <algorithm>
 783         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
 784         { return std::make_pair(a + b, a + b < a); }
 785         bool no_overflow(unsigned a, unsigned b)
 786         { return !full_add(a, b).second; }
 787
 788 Should compile to:
 789
 790 __Z11no_overflowjj:
 791         add r4,r3,r4
 792         subfc r3,r3,r4
 793         li r3,0
 794         adde r3,r3,r3
 795         blr
 796
 797 (or better) not:
 798
 799 __Z11no_overflowjj:
 800         add r2, r4, r3
 801         cmplw cr7, r2, r3
 802         mfcr r2
 803         rlwinm r2, r2, 29, 31, 31
 804         xori r3, r2, 1
 805         blr
 806
 807 //===---------------------------------------------------------------------===//
 808
 809 We compile some FP comparisons into an mfcr with two rlwinms and an or.  For
 810 example:
 811 #include <math.h>
 812 int test(double x, double y) { return islessequal(x, y);}
 813 int test2(double x, double y) {  return islessgreater(x, y);}
 814 int test3(double x, double y) {  return !islessequal(x, y);}
 815
 816 Compiles into (all three are similar, but the bits differ):
 817
 818 _test:
 819         fcmpu cr7, f1, f2
 820         mfcr r2
 821         rlwinm r3, r2, 29, 31, 31
 822         rlwinm r2, r2, 31, 31, 31
 823         or r3, r2, r3
 824         blr
 825
 826 GCC compiles this into:
 827
 828  _test:
 829         fcmpu cr7,f1,f2
 830         cror 30,28,30
 831         mfcr r3
 832         rlwinm r3,r3,31,1
 833         blr
 834
 835 which is more efficient and can use mfocr.  See PR642 for some more context.
 836
 837 //===---------------------------------------------------------------------===//
 838
 839 void foo(float *data, float d) {
 840    long i;
 841    for (i = 0; i < 8000; i++)
 842       data[i] = d;
 843 }
 844 void foo2(float *data, float d) {
 845    long i;
 846    data--;
 847    for (i = 0; i < 8000; i++) {
 848       data[1] = d;
 849       data++;
 850    }
 851 }
 852
 853 These compile to:
 854
 855 _foo:
 856         li r2, 0
 857 LBB1_1: ; bb
 858         addi r4, r2, 4
 859         stfsx f1, r3, r2
 860         cmplwi cr0, r4, 32000
 861         mr r2, r4
 862         bne cr0, LBB1_1 ; bb
 863         blr
 864 _foo2:
 865         li r2, 0
 866 LBB2_1: ; bb
 867         addi r4, r2, 4
 868         stfsx f1, r3, r2
 869         cmplwi cr0, r4, 32000
 870         mr r2, r4
 871         bne cr0, LBB2_1 ; bb
 872         blr
 873
 874 The 'mr' could be eliminated by folding the add into the cmp better.
 875
 876 //===---------------------------------------------------------------------===//
 877 Codegen for the following (low-probability) case deteriorated considerably
 878 when the correctness fixes for unordered comparisons went in (PR 642, 58871).
 879 It should be possible to recover the code quality described in the comments.
 880
 881 ; RUN: llvm-as < %s | llc -march=ppc32  | grep or | count 3
 882 ; This should produce one 'or' or 'cror' instruction per function.
 883
 884 ; RUN: llvm-as < %s | llc -march=ppc32  | grep mfcr | count 3
 885 ; PR2964
 886
 887 define i32 @test(double %x, double %y) nounwind  {
 888 entry:
 889         %tmp3 = fcmp ole double %x, %y          ; <i1> [#uses=1]
 890         %tmp345 = zext i1 %tmp3 to i32          ; <i32> [#uses=1]
 891         ret i32 %tmp345
 892 }
 893
 894 define i32 @test2(double %x, double %y) nounwind  {
 895 entry:
 896         %tmp3 = fcmp one double %x, %y          ; <i1> [#uses=1]
 897         %tmp345 = zext i1 %tmp3 to i32          ; <i32> [#uses=1]
 898         ret i32 %tmp345
 899 }
 900
 901 define i32 @test3(double %x, double %y) nounwind  {
 902 entry:
 903         %tmp3 = fcmp ugt double %x, %y          ; <i1> [#uses=1]
 904         %tmp34 = zext i1 %tmp3 to i32           ; <i32> [#uses=1]
 905         ret i32 %tmp34
 906 }
 907 //===----------------------------------------------------------------------===//
 908 ; RUN: llvm-as < %s | llc -march=ppc32 | not grep fneg
 909
 910 ; This could generate FSEL with appropriate flags (FSEL is not IEEE-safe, and
 911 ; should not be generated except with -enable-finite-only-fp-math or the like).
 912 ; With the correctness fixes for PR642 (58871) LowerSELECT_CC would need to
 913 ; recognize a more elaborate tree than a simple SETxx.
 914
 915 define double @test_FNEG_sel(double %A, double %B, double %C) {
 916         %D = fsub double -0.000000e+00, %A               ; <double> [#uses=1]
 917         %Cond = fcmp ugt double %D, -0.000000e+00               ; <i1> [#uses=1]
 918         %E = select i1 %Cond, double %B, double %C              ; <double> [#uses=1]
 919         ret double %E
 920 }
 921
 922 //===----------------------------------------------------------------------===//
 923 The save/restore sequence for CR in prolog/epilog is terrible:
 924 - Each CR subreg is saved individually, rather than doing one save as a unit.
 925 - On Darwin, the save is done after the decrement of SP, which means the offset
 926 from SP of the save slot can be too big for a store instruction, which means we
 927 need an additional register (currently hacked in 96015+96020; the solution there
 928 is correct, but poor).
 929 - On SVR4 the same thing can happen, and I don't think saving before the SP
 930 decrement is safe on that target, as there is no red zone.  This is currently
 931 broken AFAIK, although it's not a target I can exercise.
 932 The following demonstrates the problem:
 933 extern void bar(char *p);
 934 void foo() {
 935   char x[100000];
 936   bar(x);
 937   __asm__("" ::: "cr2");
 938 }