lib/Target/PowerPC/README.txt

   1 //===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
   2
   3 TODO:
   4 * lmw/stmw pass a la arm load store optimizer for prolog/epilog
   5
   6 ===-------------------------------------------------------------------------===
   7
   8 This code:
   9
  10 unsigned add32carry(unsigned sum, unsigned x) {
  11  unsigned z = sum + x;
  12  if (sum + x < x)
  13      z++;
  14  return z;
  15 }
  16
  17 Should compile to something like:
  18
  19         addc r3,r3,r4
  20         addze r3,r3
  21
  22 instead we get:
  23
  24         add r3, r4, r3
  25         cmplw cr7, r3, r4
  26         mfcr r4 ; 1
  27         rlwinm r4, r4, 29, 31, 31
  28         add r3, r3, r4
  29
  30 Ick.
  31
  32 ===-------------------------------------------------------------------------===
  33
  34 We compile the hottest inner loop of viterbi to:
  35
  36         li r6, 0
  37         b LBB1_84       ;bb432.i
  38 LBB1_83:        ;bb420.i
  39         lbzx r8, r5, r7
  40         addi r6, r7, 1
  41         stbx r8, r4, r7
  42 LBB1_84:        ;bb432.i
  43         mr r7, r6
  44         cmplwi cr0, r7, 143
  45         bne cr0, LBB1_83        ;bb420.i
  46
  47 The CBE manages to produce:
  48
  49         li r0, 143
  50         mtctr r0
  51 loop:
  52         lbzx r2, r2, r11
  53         stbx r0, r2, r9
  54         addi r2, r2, 1
  55         bdz later
  56         b loop
  57
  58 This could be much better (bdnz instead of bdz) but it still beats us.  If we
  59 produced this with bdnz, the loop would be a single dispatch group.
  60
  61 ===-------------------------------------------------------------------------===
  62
  63 Lump the constant pool for each function into ONE pic object, and reference
  64 pieces of it as offsets from the start.  For functions like this (contrived
  65 to have lots of constants obviously):
  66
  67 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
  68
  69 We generate:
  70
  71 _X:
  72         lis r2, ha16(.CPI_X_0)
  73         lfd f0, lo16(.CPI_X_0)(r2)
  74         lis r2, ha16(.CPI_X_1)
  75         lfd f2, lo16(.CPI_X_1)(r2)
  76         fmadd f0, f1, f0, f2
  77         lis r2, ha16(.CPI_X_2)
  78         lfd f1, lo16(.CPI_X_2)(r2)
  79         lis r2, ha16(.CPI_X_3)
  80         lfd f2, lo16(.CPI_X_3)(r2)
  81         fmadd f1, f0, f1, f2
  82         blr
  83
  84 It would be better to materialize .CPI_X into a register, then use immediates
  85 off of the register to avoid the lis's.  This is even more important in PIC
  86 mode.
  87
  88 Note that this (and the static variable version) is discussed here for GCC:
  89 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  90
  91 Here's another example (the sgn function):
  92 double testf(double a) {
  93        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
  94 }
  95
  96 it produces a BB like this:
  97 LBB1_1: ; cond_true
  98         lis r2, ha16(LCPI1_0)
  99         lfs f0, lo16(LCPI1_0)(r2)
 100         lis r2, ha16(LCPI1_1)
 101         lis r3, ha16(LCPI1_2)
 102         lfs f2, lo16(LCPI1_2)(r3)
 103         lfs f3, lo16(LCPI1_1)(r2)
 104         fsub f0, f0, f1
 105         fsel f1, f0, f2, f3
 106         blr
 107
 108 ===-------------------------------------------------------------------------===
 109
 110 PIC Code Gen IPO optimization:
 111
 112 Squish small scalar globals together into a single global struct, allowing the
 113 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
 114 of the GOT on targets with one).
 115
 116 Note that this is discussed here for GCC:
 117 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 118
 119 ===-------------------------------------------------------------------------===
 120
 121 Darwin Stub removal:
 122
 123 We still generate calls to foo$stub, and stubs, on Darwin.  This is not
 124 necessary when building with the Leopard (10.5) or later linker, as stubs are
 125 generated by ld when necessary.  Parameterizing this based on the deployment
 126 target (-mmacosx-version-min) is probably enough.  x86-32 does this right, see
 127 its logic.
 128
 129 ===-------------------------------------------------------------------------===
 130
 131 Darwin Stub LICM optimization:
 132
 133 Loops like this:
 134
 135   for (...)  bar();
 136
 137 Have to go through an indirect stub if bar is external or linkonce.  It would
 138 be better to compile it as:
 139
 140      fp = &bar;
 141      for (...)  fp();
 142
 143 which only computes the address of bar once (instead of each time through the
 144 stub).  This is Darwin specific and would have to be done in the code generator.
 145 Probably not a win on x86.
 146
 147 ===-------------------------------------------------------------------------===
 148
 149 Simple IPO for argument passing, change:
 150   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 151
 152 the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 153 of arguments get assigned to r3 through r10. That is, if you have a function
 154 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 155 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 156 order for functions we can internalize so that the maximum number of
 157 integers/pointers get passed in regs before you see any of the fp arguments.
 158
 159 Instead of implementing this, it would actually probably be easier to just
 160 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 161 including having this work sanely.
 162
 163 ===-------------------------------------------------------------------------===
 164
 165 Fix Darwin FP-In-Integer Registers ABI
 166
 167 Darwin passes doubles in structures in integer registers, which is very very
 168 bad.  Add something like a BITCAST to LLVM, then do an i-p transformation that
 169 percolates these things out of functions.
 170
 171 Check out how horrible this is:
 172 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 173
 174 This is an extension of "interprocedural CC unmunging" that can't be done with
 175 just fastcc.
 176
 177 ===-------------------------------------------------------------------------===
 178
 179 Fold add and sub with constant into non-extern, non-weak addresses so this:
 180
 181 static int a;
 182 void bar(int b) { a = b; }
 183 void foo(unsigned char *c) {
 184   *c = a;
 185 }
 186
 187 So that
 188
 189 _foo:
 190         lis r2, ha16(_a)
 191         la r2, lo16(_a)(r2)
 192         lbz r2, 3(r2)
 193         stb r2, 0(r3)
 194         blr
 195
 196 Becomes
 197
 198 _foo:
 199         lis r2, ha16(_a+3)
 200         lbz r2, lo16(_a+3)(r2)
 201         stb r2, 0(r3)
 202         blr
 203
 204 ===-------------------------------------------------------------------------===
 205
 206 On the G5, logical CR operations are more expensive in their three
 207 address form: ops that read/write the same register are half as expensive as
 208 those that read from two registers that are different from their destination.
 209
 210 We should model this with two separate instructions.  The isel should generate
 211 the "two address" form of the instructions.  When the register allocator
 212 detects that it needs to insert a copy due to the two-addresness of the CR
 213 logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
 214 we can convert to the "three address" instruction, to save code space.
 215
 216 This only matters when we start generating cr logical ops.
 217
 218 ===-------------------------------------------------------------------------===
 219
 220 We should compile these two functions to the same thing:
 221
 222 #include <stdlib.h>
 223 void f(int a, int b, int *P) {
 224   *P = (a-b)>=0?(a-b):(b-a);
 225 }
 226 void g(int a, int b, int *P) {
 227   *P = abs(a-b);
 228 }
 229
 230 Further, they should compile to something better than:
 231
 232 _g:
 233         subf r2, r4, r3
 234         subfic r3, r2, 0
 235         cmpwi cr0, r2, -1
 236         bgt cr0, LBB2_2 ; entry
 237 LBB2_1: ; entry
 238         mr r2, r3
 239 LBB2_2: ; entry
 240         stw r2, 0(r5)
 241         blr
 242
 243 GCC produces:
 244
 245 _g:
 246         subf r4,r4,r3
 247         srawi r2,r4,31
 248         xor r0,r2,r4
 249         subf r0,r2,r0
 250         stw r0,0(r5)
 251         blr
 252
 253 ... which is much nicer.
 254
 255 This theoretically may help improve twolf slightly (used in dimbox.c:142?).
 256
 257 ===-------------------------------------------------------------------------===
 258
 259 PR5945: This:
 260 define i32 @clamp0g(i32 %a) {
 261 entry:
 262         %cmp = icmp slt i32 %a, 0
 263         %sel = select i1 %cmp, i32 0, i32 %a
 264         ret i32 %sel
 265 }
 266
 267 Is compile to this with the PowerPC (32-bit) backend:
 268
 269 _clamp0g:
 270         cmpwi cr0, r3, 0
 271         li r2, 0
 272         blt cr0, LBB1_2
 273 ; BB#1:                                                     ; %entry
 274         mr r2, r3
 275 LBB1_2:                                                     ; %entry
 276         mr r3, r2
 277         blr
 278
 279 This could be reduced to the much simpler:
 280
 281 _clamp0g:
 282         srawi r2, r3, 31
 283         andc r3, r3, r2
 284         blr
 285
 286 ===-------------------------------------------------------------------------===
 287
 288 int foo(int N, int ***W, int **TK, int X) {
 289   int t, i;
 290
 291   for (t = 0; t < N; ++t)
 292     for (i = 0; i < 4; ++i)
 293       W[t / X][i][t % X] = TK[i][t];
 294
 295   return 5;
 296 }
 297
 298 We generate relatively atrocious code for this loop compared to gcc.
 299
 300 We could also strength reduce the rem and the div:
 301 http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
 302
 303 ===-------------------------------------------------------------------------===
 304
 305 float foo(float X) { return (int)(X); }
 306
 307 Currently produces:
 308
 309 _foo:
 310         fctiwz f0, f1
 311         stfd f0, -8(r1)
 312         lwz r2, -4(r1)
 313         extsw r2, r2
 314         std r2, -16(r1)
 315         lfd f0, -16(r1)
 316         fcfid f0, f0
 317         frsp f1, f0
 318         blr
 319
 320 We could use a target dag combine to turn the lwz/extsw into an lwa when the
 321 lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
 322 win only.
 323
 324 ===-------------------------------------------------------------------------===
 325
 326 We generate ugly code for this:
 327
 328 void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
 329   unsigned code = 0;
 330   if(dx < -dw) code |= 1;
 331   if(dx > dw)  code |= 2;
 332   if(dy < -dw) code |= 4;
 333   if(dy > dw)  code |= 8;
 334   if(dz < -dw) code |= 16;
 335   if(dz > dw)  code |= 32;
 336   *ret = code;
 337 }
 338
 339 ===-------------------------------------------------------------------------===
 340
 341 %struct.B = type { i8, [3 x i8] }
 342
 343 define void @bar(%struct.B* %b) {
 344 entry:
 345         %tmp = bitcast %struct.B* %b to i32*              ; <uint*> [#uses=1]
 346         %tmp = load i32* %tmp          ; <uint> [#uses=1]
 347         %tmp3 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=1]
 348         %tmp4 = load i32* %tmp3                ; <uint> [#uses=1]
 349         %tmp8 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=2]
 350         %tmp9 = load i32* %tmp8                ; <uint> [#uses=1]
 351         %tmp4.mask17 = shl i32 %tmp4, i8 1          ; <uint> [#uses=1]
 352         %tmp1415 = and i32 %tmp4.mask17, 2147483648            ; <uint> [#uses=1]
 353         %tmp.masked = and i32 %tmp, 2147483648         ; <uint> [#uses=1]
 354         %tmp11 = or i32 %tmp1415, %tmp.masked          ; <uint> [#uses=1]
 355         %tmp12 = and i32 %tmp9, 2147483647             ; <uint> [#uses=1]
 356         %tmp13 = or i32 %tmp12, %tmp11         ; <uint> [#uses=1]
 357         store i32 %tmp13, i32* %tmp8
 358         ret void
 359 }
 360
 361 We emit:
 362
 363 _foo:
 364         lwz r2, 0(r3)
 365         slwi r4, r2, 1
 366         or r4, r4, r2
 367         rlwimi r2, r4, 0, 0, 0
 368         stw r2, 0(r3)
 369         blr
 370
 371 We could collapse a bunch of those ORs and ANDs and generate the following
 372 equivalent code:
 373
 374 _foo:
 375         lwz r2, 0(r3)
 376         rlwinm r4, r2, 1, 0, 0
 377         or r2, r2, r4
 378         stw r2, 0(r3)
 379         blr
 380
 381 ===-------------------------------------------------------------------------===
 382
 383 Consider a function like this:
 384
 385 float foo(float X) { return X + 1234.4123f; }
 386
 387 The FP constant ends up in the constant pool, so we need to get the LR register.
 388  This ends up producing code like this:
 389
 390 _foo:
 391 .LBB_foo_0:     ; entry
 392         mflr r11
 393 ***     stw r11, 8(r1)
 394         bl "L00000$pb"
 395 "L00000$pb":
 396         mflr r2
 397         addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
 398         lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
 399         fadds f1, f1, f0
 400 ***     lwz r11, 8(r1)
 401         mtlr r11
 402         blr
 403
 404 This is functional, but there is no reason to spill the LR register all the way
 405 to the stack (the two marked instrs): spilling it to a GPR is quite enough.
 406
 407 Implementing this will require some codegen improvements.  Nate writes:
 408
 409 "So basically what we need to support the "no stack frame save and restore" is a
 410 generalization of the LR optimization to "callee-save regs".
 411
 412 Currently, we have LR marked as a callee-save reg.  The register allocator sees
 413 that it's callee save, and spills it directly to the stack.
 414
 415 Ideally, something like this would happen:
 416
 417 LR would be in a separate register class from the GPRs. The class of LR would be
 418 marked "unspillable".  When the register allocator came across an unspillable
 419 reg, it would ask "what is the best class to copy this into that I *can* spill"
 420 If it gets a class back, which it will in this case (the gprs), it grabs a free
 421 register of that class.  If it is then later necessary to spill that reg, so be
 422 it.
 423
 424 ===-------------------------------------------------------------------------===
 425
 426 We compile this:
 427 int test(_Bool X) {
 428   return X ? 524288 : 0;
 429 }
 430
 431 to:
 432 _test:
 433         cmplwi cr0, r3, 0
 434         lis r2, 8
 435         li r3, 0
 436         beq cr0, LBB1_2 ;entry
 437 LBB1_1: ;entry
 438         mr r3, r2
 439 LBB1_2: ;entry
 440         blr
 441
 442 instead of:
 443 _test:
 444         addic r2,r3,-1
 445         subfe r0,r2,r3
 446         slwi r3,r0,19
 447         blr
 448
 449 This sort of thing occurs a lot due to globalopt.
 450
 451 ===-------------------------------------------------------------------------===
 452
 453 We compile:
 454
 455 define i32 @bar(i32 %x) nounwind readnone ssp {
 456 entry:
 457   %0 = icmp eq i32 %x, 0                          ; <i1> [#uses=1]
 458   %neg = sext i1 %0 to i32              ; <i32> [#uses=1]
 459   ret i32 %neg
 460 }
 461
 462 to:
 463
 464 _bar:
 465         cntlzw r2, r3
 466         slwi r2, r2, 26
 467         srawi r3, r2, 31
 468         blr
 469
 470 it would be better to produce:
 471
 472 _bar:
 473         addic r3,r3,-1
 474         subfe r3,r3,r3
 475         blr
 476
 477 ===-------------------------------------------------------------------------===
 478
 479 We generate horrible ppc code for this:
 480
 481 #define N  2000000
 482 double   a[N],c[N];
 483 void simpleloop() {
 484    int j;
 485    for (j=0; j<N; j++)
 486      c[j] = a[j];
 487 }
 488
 489 LBB1_1: ;bb
 490         lfdx f0, r3, r4
 491         addi r5, r5, 1                 ;; Extra IV for the exit value compare.
 492         stfdx f0, r2, r4
 493         addi r4, r4, 8
 494
 495         xoris r6, r5, 30               ;; This is due to a large immediate.
 496         cmplwi cr0, r6, 33920
 497         bne cr0, LBB1_1
 498
 499 //===---------------------------------------------------------------------===//
 500
 501 This:
 502         #include <algorithm>
 503         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
 504         { return std::make_pair(a + b, a + b < a); }
 505         bool no_overflow(unsigned a, unsigned b)
 506         { return !full_add(a, b).second; }
 507
 508 Should compile to:
 509
 510 __Z11no_overflowjj:
 511         add r4,r3,r4
 512         subfc r3,r3,r4
 513         li r3,0
 514         adde r3,r3,r3
 515         blr
 516
 517 (or better) not:
 518
 519 __Z11no_overflowjj:
 520         add r2, r4, r3
 521         cmplw cr7, r2, r3
 522         mfcr r2
 523         rlwinm r2, r2, 29, 31, 31
 524         xori r3, r2, 1
 525         blr
 526
 527 //===---------------------------------------------------------------------===//
 528
 529 We compile some FP comparisons into an mfcr with two rlwinms and an or.  For
 530 example:
 531 #include <math.h>
 532 int test(double x, double y) { return islessequal(x, y);}
 533 int test2(double x, double y) {  return islessgreater(x, y);}
 534 int test3(double x, double y) {  return !islessequal(x, y);}
 535
 536 Compiles into (all three are similar, but the bits differ):
 537
 538 _test:
 539         fcmpu cr7, f1, f2
 540         mfcr r2
 541         rlwinm r3, r2, 29, 31, 31
 542         rlwinm r2, r2, 31, 31, 31
 543         or r3, r2, r3
 544         blr
 545
 546 GCC compiles this into:
 547
 548  _test:
 549         fcmpu cr7,f1,f2
 550         cror 30,28,30
 551         mfcr r3
 552         rlwinm r3,r3,31,1
 553         blr
 554
 555 which is more efficient and can use mfocr.  See PR642 for some more context.
 556
 557 //===---------------------------------------------------------------------===//
 558
 559 void foo(float *data, float d) {
 560    long i;
 561    for (i = 0; i < 8000; i++)
 562       data[i] = d;
 563 }
 564 void foo2(float *data, float d) {
 565    long i;
 566    data--;
 567    for (i = 0; i < 8000; i++) {
 568       data[1] = d;
 569       data++;
 570    }
 571 }
 572
 573 These compile to:
 574
 575 _foo:
 576         li r2, 0
 577 LBB1_1: ; bb
 578         addi r4, r2, 4
 579         stfsx f1, r3, r2
 580         cmplwi cr0, r4, 32000
 581         mr r2, r4
 582         bne cr0, LBB1_1 ; bb
 583         blr
 584 _foo2:
 585         li r2, 0
 586 LBB2_1: ; bb
 587         addi r4, r2, 4
 588         stfsx f1, r3, r2
 589         cmplwi cr0, r4, 32000
 590         mr r2, r4
 591         bne cr0, LBB2_1 ; bb
 592         blr
 593
 594 The 'mr' could be eliminated to folding the add into the cmp better.
 595
 596 //===---------------------------------------------------------------------===//
 597 Codegen for the following (low-probability) case deteriorated considerably
 598 when the correctness fixes for unordered comparisons went in (PR 642, 58871).
 599 It should be possible to recover the code quality described in the comments.
 600
 601 ; RUN: llvm-as < %s | llc -march=ppc32  | grep or | count 3
 602 ; This should produce one 'or' or 'cror' instruction per function.
 603
 604 ; RUN: llvm-as < %s | llc -march=ppc32  | grep mfcr | count 3
 605 ; PR2964
 606
 607 define i32 @test(double %x, double %y) nounwind  {
 608 entry:
 609         %tmp3 = fcmp ole double %x, %y          ; <i1> [#uses=1]
 610         %tmp345 = zext i1 %tmp3 to i32          ; <i32> [#uses=1]
 611         ret i32 %tmp345
 612 }
 613
 614 define i32 @test2(double %x, double %y) nounwind  {
 615 entry:
 616         %tmp3 = fcmp one double %x, %y          ; <i1> [#uses=1]
 617         %tmp345 = zext i1 %tmp3 to i32          ; <i32> [#uses=1]
 618         ret i32 %tmp345
 619 }
 620
 621 define i32 @test3(double %x, double %y) nounwind  {
 622 entry:
 623         %tmp3 = fcmp ugt double %x, %y          ; <i1> [#uses=1]
 624         %tmp34 = zext i1 %tmp3 to i32           ; <i32> [#uses=1]
 625         ret i32 %tmp34
 626 }
 627 //===----------------------------------------------------------------------===//
 628 ; RUN: llvm-as < %s | llc -march=ppc32 | not grep fneg
 629
 630 ; This could generate FSEL with appropriate flags (FSEL is not IEEE-safe, and
 631 ; should not be generated except with -enable-finite-only-fp-math or the like).
 632 ; With the correctness fixes for PR642 (58871) LowerSELECT_CC would need to
 633 ; recognize a more elaborate tree than a simple SETxx.
 634
 635 define double @test_FNEG_sel(double %A, double %B, double %C) {
 636         %D = fsub double -0.000000e+00, %A               ; <double> [#uses=1]
 637         %Cond = fcmp ugt double %D, -0.000000e+00               ; <i1> [#uses=1]
 638         %E = select i1 %Cond, double %B, double %C              ; <double> [#uses=1]
 639         ret double %E
 640 }
 641
 642 //===----------------------------------------------------------------------===//
 643 The save/restore sequence for CR in prolog/epilog is terrible:
 644 - Each CR subreg is saved individually, rather than doing one save as a unit.
 645 - On Darwin, the save is done after the decrement of SP, which means the offset
 646 from SP of the save slot can be too big for a store instruction, which means we
 647 need an additional register (currently hacked in 96015+96020; the solution there
 648 is correct, but poor).
 649 - On SVR4 the same thing can happen, and I don't think saving before the SP
 650 decrement is safe on that target, as there is no red zone.  This is currently
 651 broken AFAIK, although it's not a target I can exercise.
 652 The following demonstrates the problem:
 653 extern void bar(char *p);
 654 void foo() {
 655   char x[100000];
 656   bar(x);
 657   __asm__("" ::: "cr2");
 658 }