lib/Target/PowerPC/README.txt

   1 //===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
   2
   3 TODO:
   4 * gpr0 allocation
   5 * implement do-loop -> bdnz transform
   6
   7 ===-------------------------------------------------------------------------===
   8
   9 Support 'update' load/store instructions.  These are cracked on the G5, but are
  10 still a codesize win.
  11
  12 With preinc enabled, this:
  13
  14 long *%test4(long *%X, long *%dest) {
  15         %Y = getelementptr long* %X, int 4
  16         %A = load long* %Y
  17         store long %A, long* %dest
  18         ret long* %Y
  19 }
  20
  21 compiles to:
  22
  23 _test4:
  24         mr r2, r3
  25         lwzu r5, 32(r2)
  26         lwz r3, 36(r3)
  27         stw r5, 0(r4)
  28         stw r3, 4(r4)
  29         mr r3, r2
  30         blr
  31
  32 with -sched=list-burr, I get:
  33
  34 _test4:
  35         lwz r2, 36(r3)
  36         lwzu r5, 32(r3)
  37         stw r2, 4(r4)
  38         stw r5, 0(r4)
  39         blr
  40
  41 ===-------------------------------------------------------------------------===
  42
  43 We compile the hottest inner loop of viterbi to:
  44
  45         li r6, 0
  46         b LBB1_84       ;bb432.i
  47 LBB1_83:        ;bb420.i
  48         lbzx r8, r5, r7
  49         addi r6, r7, 1
  50         stbx r8, r4, r7
  51 LBB1_84:        ;bb432.i
  52         mr r7, r6
  53         cmplwi cr0, r7, 143
  54         bne cr0, LBB1_83        ;bb420.i
  55
  56 The CBE manages to produce:
  57
  58         li r0, 143
  59         mtctr r0
  60 loop:
  61         lbzx r2, r2, r11
  62         stbx r0, r2, r9
  63         addi r2, r2, 1
  64         bdz later
  65         b loop
  66
  67 This could be much better (bdnz instead of bdz) but it still beats us.  If we
  68 produced this with bdnz, the loop would be a single dispatch group.
  69
  70 ===-------------------------------------------------------------------------===
  71
  72 Compile:
  73
  74 void foo(int *P) {
  75  if (P)  *P = 0;
  76 }
  77
  78 into:
  79
  80 _foo:
  81         cmpwi cr0,r3,0
  82         beqlr cr0
  83         li r0,0
  84         stw r0,0(r3)
  85         blr
  86
  87 This is effectively a simple form of predication.
  88
  89 ===-------------------------------------------------------------------------===
  90
  91 Lump the constant pool for each function into ONE pic object, and reference
  92 pieces of it as offsets from the start.  For functions like this (contrived
  93 to have lots of constants obviously):
  94
  95 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
  96
  97 We generate:
  98
  99 _X:
 100         lis r2, ha16(.CPI_X_0)
 101         lfd f0, lo16(.CPI_X_0)(r2)
 102         lis r2, ha16(.CPI_X_1)
 103         lfd f2, lo16(.CPI_X_1)(r2)
 104         fmadd f0, f1, f0, f2
 105         lis r2, ha16(.CPI_X_2)
 106         lfd f1, lo16(.CPI_X_2)(r2)
 107         lis r2, ha16(.CPI_X_3)
 108         lfd f2, lo16(.CPI_X_3)(r2)
 109         fmadd f1, f0, f1, f2
 110         blr
 111
 112 It would be better to materialize .CPI_X into a register, then use immediates
 113 off of the register to avoid the lis's.  This is even more important in PIC
 114 mode.
 115
 116 Note that this (and the static variable version) is discussed here for GCC:
 117 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 118
 119 Here's another example (the sgn function):
 120 double testf(double a) {
 121        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
 122 }
 123
 124 it produces a BB like this:
 125 LBB1_1: ; cond_true
 126         lis r2, ha16(LCPI1_0)
 127         lfs f0, lo16(LCPI1_0)(r2)
 128         lis r2, ha16(LCPI1_1)
 129         lis r3, ha16(LCPI1_2)
 130         lfs f2, lo16(LCPI1_2)(r3)
 131         lfs f3, lo16(LCPI1_1)(r2)
 132         fsub f0, f0, f1
 133         fsel f1, f0, f2, f3
 134         blr
 135
 136 ===-------------------------------------------------------------------------===
 137
 138 PIC Code Gen IPO optimization:
 139
 140 Squish small scalar globals together into a single global struct, allowing the
 141 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
 142 of the GOT on targets with one).
 143
 144 Note that this is discussed here for GCC:
 145 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 146
 147 ===-------------------------------------------------------------------------===
 148
 149 Implement Newton-Rhapson method for improving estimate instructions to the
 150 correct accuracy, and implementing divide as multiply by reciprocal when it has
 151 more than one use.  Itanium will want this too.
 152
 153 ===-------------------------------------------------------------------------===
 154
 155 Compile this:
 156
 157 int %f1(int %a, int %b) {
 158         %tmp.1 = and int %a, 15         ; <int> [#uses=1]
 159         %tmp.3 = and int %b, 240                ; <int> [#uses=1]
 160         %tmp.4 = or int %tmp.3, %tmp.1          ; <int> [#uses=1]
 161         ret int %tmp.4
 162 }
 163
 164 without a copy.  We make this currently:
 165
 166 _f1:
 167         rlwinm r2, r4, 0, 24, 27
 168         rlwimi r2, r3, 0, 28, 31
 169         or r3, r2, r2
 170         blr
 171
 172 The two-addr pass or RA needs to learn when it is profitable to commute an
 173 instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
 174 currently only commutes to avoid inserting a copy BEFORE the two addr instr.
 175
 176 ===-------------------------------------------------------------------------===
 177
 178 Compile offsets from allocas:
 179
 180 int *%test() {
 181         %X = alloca { int, int }
 182         %Y = getelementptr {int,int}* %X, int 0, uint 1
 183         ret int* %Y
 184 }
 185
 186 into a single add, not two:
 187
 188 _test:
 189         addi r2, r1, -8
 190         addi r3, r2, 4
 191         blr
 192
 193 --> important for C++.
 194
 195 ===-------------------------------------------------------------------------===
 196
 197 No loads or stores of the constants should be needed:
 198
 199 struct foo { double X, Y; };
 200 void xxx(struct foo F);
 201 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 202
 203 ===-------------------------------------------------------------------------===
 204
 205 Darwin Stub LICM optimization:
 206
 207 Loops like this:
 208
 209   for (...)  bar();
 210
 211 Have to go through an indirect stub if bar is external or linkonce.  It would
 212 be better to compile it as:
 213
 214      fp = &bar;
 215      for (...)  fp();
 216
 217 which only computes the address of bar once (instead of each time through the
 218 stub).  This is Darwin specific and would have to be done in the code generator.
 219 Probably not a win on x86.
 220
 221 ===-------------------------------------------------------------------------===
 222
 223 Simple IPO for argument passing, change:
 224   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 225
 226 the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 227 of arguments get assigned to r3 through r10. That is, if you have a function
 228 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 229 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 230 order for functions we can internalize so that the maximum number of
 231 integers/pointers get passed in regs before you see any of the fp arguments.
 232
 233 Instead of implementing this, it would actually probably be easier to just
 234 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 235 including having this work sanely.
 236
 237 ===-------------------------------------------------------------------------===
 238
 239 Fix Darwin FP-In-Integer Registers ABI
 240
 241 Darwin passes doubles in structures in integer registers, which is very very
 242 bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
 243 that percolates these things out of functions.
 244
 245 Check out how horrible this is:
 246 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 247
 248 This is an extension of "interprocedural CC unmunging" that can't be done with
 249 just fastcc.
 250
 251 ===-------------------------------------------------------------------------===
 252
 253 Compile this:
 254
 255 int foo(int a) {
 256   int b = (a < 8);
 257   if (b) {
 258     return b * 3;     // ignore the fact that this is always 3.
 259   } else {
 260     return 2;
 261   }
 262 }
 263
 264 into something not this:
 265
 266 _foo:
 267 1)      cmpwi cr7, r3, 8
 268         mfcr r2, 1
 269         rlwinm r2, r2, 29, 31, 31
 270 1)      cmpwi cr0, r3, 7
 271         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 272 LBB1_1: ; then
 273         rlwinm r2, r2, 0, 31, 31
 274         mulli r3, r2, 3
 275         blr
 276 LBB1_2: ; UnifiedReturnBlock
 277         li r3, 2
 278         blr
 279
 280 In particular, the two compares (marked 1) could be shared by reversing one.
 281 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 282 same operands (but backwards) exists.  In this case, this wouldn't save us
 283 anything though, because the compares still wouldn't be shared.
 284
 285 ===-------------------------------------------------------------------------===
 286
 287 We should custom expand setcc instead of pretending that we have it.  That
 288 would allow us to expose the access of the crbit after the mfcr, allowing
 289 that access to be trivially folded into other ops.  A simple example:
 290
 291 int foo(int a, int b) { return (a < b) << 4; }
 292
 293 compiles into:
 294
 295 _foo:
 296         cmpw cr7, r3, r4
 297         mfcr r2, 1
 298         rlwinm r2, r2, 29, 31, 31
 299         slwi r3, r2, 4
 300         blr
 301
 302 ===-------------------------------------------------------------------------===
 303
 304 Fold add and sub with constant into non-extern, non-weak addresses so this:
 305
 306 static int a;
 307 void bar(int b) { a = b; }
 308 void foo(unsigned char *c) {
 309   *c = a;
 310 }
 311
 312 So that
 313
 314 _foo:
 315         lis r2, ha16(_a)
 316         la r2, lo16(_a)(r2)
 317         lbz r2, 3(r2)
 318         stb r2, 0(r3)
 319         blr
 320
 321 Becomes
 322
 323 _foo:
 324         lis r2, ha16(_a+3)
 325         lbz r2, lo16(_a+3)(r2)
 326         stb r2, 0(r3)
 327         blr
 328
 329 ===-------------------------------------------------------------------------===
 330
 331 We generate really bad code for this:
 332
 333 int f(signed char *a, _Bool b, _Bool c) {
 334    signed char t = 0;
 335   if (b)  t = *a;
 336   if (c)  *a = t;
 337 }
 338
 339 ===-------------------------------------------------------------------------===
 340
 341 This:
 342 int test(unsigned *P) { return *P >> 24; }
 343
 344 Should compile to:
 345
 346 _test:
 347         lbz r3,0(r3)
 348         blr
 349
 350 not:
 351
 352 _test:
 353         lwz r2, 0(r3)
 354         srwi r3, r2, 24
 355         blr
 356
 357 ===-------------------------------------------------------------------------===
 358
 359 On the G5, logical CR operations are more expensive in their three
 360 address form: ops that read/write the same register are half as expensive as
 361 those that read from two registers that are different from their destination.
 362
 363 We should model this with two separate instructions.  The isel should generate
 364 the "two address" form of the instructions.  When the register allocator
 365 detects that it needs to insert a copy due to the two-addresness of the CR
 366 logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
 367 we can convert to the "three address" instruction, to save code space.
 368
 369 This only matters when we start generating cr logical ops.
 370
 371 ===-------------------------------------------------------------------------===
 372
 373 We should compile these two functions to the same thing:
 374
 375 #include <stdlib.h>
 376 void f(int a, int b, int *P) {
 377   *P = (a-b)>=0?(a-b):(b-a);
 378 }
 379 void g(int a, int b, int *P) {
 380   *P = abs(a-b);
 381 }
 382
 383 Further, they should compile to something better than:
 384
 385 _g:
 386         subf r2, r4, r3
 387         subfic r3, r2, 0
 388         cmpwi cr0, r2, -1
 389         bgt cr0, LBB2_2 ; entry
 390 LBB2_1: ; entry
 391         mr r2, r3
 392 LBB2_2: ; entry
 393         stw r2, 0(r5)
 394         blr
 395
 396 GCC produces:
 397
 398 _g:
 399         subf r4,r4,r3
 400         srawi r2,r4,31
 401         xor r0,r2,r4
 402         subf r0,r2,r0
 403         stw r0,0(r5)
 404         blr
 405
 406 ... which is much nicer.
 407
 408 This theoretically may help improve twolf slightly (used in dimbox.c:142?).
 409
 410 ===-------------------------------------------------------------------------===
 411
 412 int foo(int N, int ***W, int **TK, int X) {
 413   int t, i;
 414
 415   for (t = 0; t < N; ++t)
 416     for (i = 0; i < 4; ++i)
 417       W[t / X][i][t % X] = TK[i][t];
 418
 419   return 5;
 420 }
 421
 422 We generate relatively atrocious code for this loop compared to gcc.
 423
 424 We could also strength reduce the rem and the div:
 425 http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
 426
 427 ===-------------------------------------------------------------------------===
 428
 429 float foo(float X) { return (int)(X); }
 430
 431 Currently produces:
 432
 433 _foo:
 434         fctiwz f0, f1
 435         stfd f0, -8(r1)
 436         lwz r2, -4(r1)
 437         extsw r2, r2
 438         std r2, -16(r1)
 439         lfd f0, -16(r1)
 440         fcfid f0, f0
 441         frsp f1, f0
 442         blr
 443
 444 We could use a target dag combine to turn the lwz/extsw into an lwa when the
 445 lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
 446 win only.
 447
 448 ===-------------------------------------------------------------------------===
 449
 450 We generate ugly code for this:
 451
 452 void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
 453   unsigned code = 0;
 454   if(dx < -dw) code |= 1;
 455   if(dx > dw)  code |= 2;
 456   if(dy < -dw) code |= 4;
 457   if(dy > dw)  code |= 8;
 458   if(dz < -dw) code |= 16;
 459   if(dz > dw)  code |= 32;
 460   *ret = code;
 461 }
 462
 463 ===-------------------------------------------------------------------------===
 464
 465 Complete the signed i32 to FP conversion code using 64-bit registers
 466 transformation, good for PI.  See PPCISelLowering.cpp, this comment:
 467
 468      // FIXME: disable this lowered code.  This generates 64-bit register values,
 469      // and we don't model the fact that the top part is clobbered by calls.  We
 470      // need to flag these together so that the value isn't live across a call.
 471      //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 472
 473 Also, if the registers are spilled to the stack, we have to ensure that all
 474 64-bits of them are save/restored, otherwise we will miscompile the code.  It
 475 sounds like we need to get the 64-bit register classes going.
 476
 477 ===-------------------------------------------------------------------------===
 478
 479 %struct.B = type { i8, [3 x i8] }
 480
 481 define void @bar(%struct.B* %b) {
 482 entry:
 483         %tmp = bitcast %struct.B* %b to i32*              ; <uint*> [#uses=1]
 484         %tmp = load i32* %tmp          ; <uint> [#uses=1]
 485         %tmp3 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=1]
 486         %tmp4 = load i32* %tmp3                ; <uint> [#uses=1]
 487         %tmp8 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=2]
 488         %tmp9 = load i32* %tmp8                ; <uint> [#uses=1]
 489         %tmp4.mask17 = shl i32 %tmp4, i8 1          ; <uint> [#uses=1]
 490         %tmp1415 = and i32 %tmp4.mask17, 2147483648            ; <uint> [#uses=1]
 491         %tmp.masked = and i32 %tmp, 2147483648         ; <uint> [#uses=1]
 492         %tmp11 = or i32 %tmp1415, %tmp.masked          ; <uint> [#uses=1]
 493         %tmp12 = and i32 %tmp9, 2147483647             ; <uint> [#uses=1]
 494         %tmp13 = or i32 %tmp12, %tmp11         ; <uint> [#uses=1]
 495         store i32 %tmp13, i32* %tmp8
 496         ret void
 497 }
 498
 499 We emit:
 500
 501 _foo:
 502         lwz r2, 0(r3)
 503         slwi r4, r2, 1
 504         or r4, r4, r2
 505         rlwimi r2, r4, 0, 0, 0
 506         stw r2, 0(r3)
 507         blr
 508
 509 We could collapse a bunch of those ORs and ANDs and generate the following
 510 equivalent code:
 511
 512 _foo:
 513         lwz r2, 0(r3)
 514         rlwinm r4, r2, 1, 0, 0
 515         or r2, r2, r4
 516         stw r2, 0(r3)
 517         blr
 518
 519 ===-------------------------------------------------------------------------===
 520
 521 We compile:
 522
 523 unsigned test6(unsigned x) {
 524   return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
 525 }
 526
 527 into:
 528
 529 _test6:
 530         lis r2, 255
 531         rlwinm r3, r3, 16, 0, 31
 532         ori r2, r2, 255
 533         and r3, r3, r2
 534         blr
 535
 536 GCC gets it down to:
 537
 538 _test6:
 539         rlwinm r0,r3,16,8,15
 540         rlwinm r3,r3,16,24,31
 541         or r3,r3,r0
 542         blr
 543
 544
 545 ===-------------------------------------------------------------------------===
 546
 547 Consider a function like this:
 548
 549 float foo(float X) { return X + 1234.4123f; }
 550
 551 The FP constant ends up in the constant pool, so we need to get the LR register.
 552  This ends up producing code like this:
 553
 554 _foo:
 555 .LBB_foo_0:     ; entry
 556         mflr r11
 557 ***     stw r11, 8(r1)
 558         bl "L00000$pb"
 559 "L00000$pb":
 560         mflr r2
 561         addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
 562         lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
 563         fadds f1, f1, f0
 564 ***     lwz r11, 8(r1)
 565         mtlr r11
 566         blr
 567
 568 This is functional, but there is no reason to spill the LR register all the way
 569 to the stack (the two marked instrs): spilling it to a GPR is quite enough.
 570
 571 Implementing this will require some codegen improvements.  Nate writes:
 572
 573 "So basically what we need to support the "no stack frame save and restore" is a
 574 generalization of the LR optimization to "callee-save regs".
 575
 576 Currently, we have LR marked as a callee-save reg.  The register allocator sees
 577 that it's callee save, and spills it directly to the stack.
 578
 579 Ideally, something like this would happen:
 580
 581 LR would be in a separate register class from the GPRs. The class of LR would be
 582 marked "unspillable".  When the register allocator came across an unspillable
 583 reg, it would ask "what is the best class to copy this into that I *can* spill"
 584 If it gets a class back, which it will in this case (the gprs), it grabs a free
 585 register of that class.  If it is then later necessary to spill that reg, so be
 586 it.
 587
 588 ===-------------------------------------------------------------------------===
 589
 590 We compile this:
 591 int test(_Bool X) {
 592   return X ? 524288 : 0;
 593 }
 594
 595 to:
 596 _test:
 597         cmplwi cr0, r3, 0
 598         lis r2, 8
 599         li r3, 0
 600         beq cr0, LBB1_2 ;entry
 601 LBB1_1: ;entry
 602         mr r3, r2
 603 LBB1_2: ;entry
 604         blr
 605
 606 instead of:
 607 _test:
 608         addic r2,r3,-1
 609         subfe r0,r2,r3
 610         slwi r3,r0,19
 611         blr
 612
 613 This sort of thing occurs a lot due to globalopt.
 614
 615 ===-------------------------------------------------------------------------===
 616
 617 We currently compile 32-bit bswap:
 618
 619 declare i32 @llvm.bswap.i32(i32 %A)
 620 define i32 @test(i32 %A) {
 621         %B = call i32 @llvm.bswap.i32(i32 %A)
 622         ret i32 %B
 623 }
 624
 625 to:
 626
 627 _test:
 628         rlwinm r2, r3, 24, 16, 23
 629         slwi r4, r3, 24
 630         rlwimi r2, r3, 8, 24, 31
 631         rlwimi r4, r3, 8, 8, 15
 632         rlwimi r4, r2, 0, 16, 31
 633         mr r3, r4
 634         blr
 635
 636 it would be more efficient to produce:
 637
 638 _foo:   mr r0,r3
 639         rlwinm r3,r3,8,0xffffffff
 640         rlwimi r3,r0,24,0,7
 641         rlwimi r3,r0,24,16,23
 642         blr
 643
 644 ===-------------------------------------------------------------------------===
 645
 646 test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:
 647
 648 __ZNK4llvm5APInt17countLeadingZerosEv:
 649         ld r2, 0(r3)
 650         cntlzd r2, r2
 651         or r2, r2, r2     <<-- silly.
 652         addi r3, r2, -64
 653         blr
 654
 655 The dead or is a 'truncate' from 64- to 32-bits.
 656
 657 ===-------------------------------------------------------------------------===
 658
 659 We generate horrible ppc code for this:
 660
 661 #define N  2000000
 662 double   a[N],c[N];
 663 void simpleloop() {
 664    int j;
 665    for (j=0; j<N; j++)
 666      c[j] = a[j];
 667 }
 668
 669 LBB1_1: ;bb
 670         lfdx f0, r3, r4
 671         addi r5, r5, 1                 ;; Extra IV for the exit value compare.
 672         stfdx f0, r2, r4
 673         addi r4, r4, 8
 674
 675         xoris r6, r5, 30               ;; This is due to a large immediate.
 676         cmplwi cr0, r6, 33920
 677         bne cr0, LBB1_1
 678
 679 //===---------------------------------------------------------------------===//
 680
 681 This:
 682         #include <algorithm>
 683         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
 684         { return std::make_pair(a + b, a + b < a); }
 685         bool no_overflow(unsigned a, unsigned b)
 686         { return !full_add(a, b).second; }
 687
 688 Should compile to:
 689
 690 __Z11no_overflowjj:
 691         add r4,r3,r4
 692         subfc r3,r3,r4
 693         li r3,0
 694         adde r3,r3,r3
 695         blr
 696
 697 (or better) not:
 698
 699 __Z11no_overflowjj:
 700         add r2, r4, r3
 701         cmplw cr7, r2, r3
 702         mfcr r2
 703         rlwinm r2, r2, 29, 31, 31
 704         xori r3, r2, 1
 705         blr
 706
 707 //===---------------------------------------------------------------------===//
 708