lib/Target/PowerPC/README.txt

   1 //===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
   2
   3 TODO:
   4 * gpr0 allocation
   5 * implement do-loop -> bdnz transform
   6 * __builtin_return_address not supported on PPC
   7
   8 ===-------------------------------------------------------------------------===
   9
  10 Support 'update' load/store instructions.  These are cracked on the G5, but are
  11 still a codesize win.
  12
  13 With preinc enabled, this:
  14
  15 long *%test4(long *%X, long *%dest) {
  16         %Y = getelementptr long* %X, int 4
  17         %A = load long* %Y
  18         store long %A, long* %dest
  19         ret long* %Y
  20 }
  21
  22 compiles to:
  23
  24 _test4:
  25         mr r2, r3
  26         lwzu r5, 32(r2)
  27         lwz r3, 36(r3)
  28         stw r5, 0(r4)
  29         stw r3, 4(r4)
  30         mr r3, r2
  31         blr
  32
  33 with -sched=list-burr, I get:
  34
  35 _test4:
  36         lwz r2, 36(r3)
  37         lwzu r5, 32(r3)
  38         stw r2, 4(r4)
  39         stw r5, 0(r4)
  40         blr
  41
  42 ===-------------------------------------------------------------------------===
  43
  44 We compile the hottest inner loop of viterbi to:
  45
  46         li r6, 0
  47         b LBB1_84       ;bb432.i
  48 LBB1_83:        ;bb420.i
  49         lbzx r8, r5, r7
  50         addi r6, r7, 1
  51         stbx r8, r4, r7
  52 LBB1_84:        ;bb432.i
  53         mr r7, r6
  54         cmplwi cr0, r7, 143
  55         bne cr0, LBB1_83        ;bb420.i
  56
  57 The CBE manages to produce:
  58
  59         li r0, 143
  60         mtctr r0
  61 loop:
  62         lbzx r2, r2, r11
  63         stbx r0, r2, r9
  64         addi r2, r2, 1
  65         bdz later
  66         b loop
  67
  68 This could be much better (bdnz instead of bdz) but it still beats us.  If we
  69 produced this with bdnz, the loop would be a single dispatch group.
  70
  71 ===-------------------------------------------------------------------------===
  72
  73 Compile:
  74
  75 void foo(int *P) {
  76  if (P)  *P = 0;
  77 }
  78
  79 into:
  80
  81 _foo:
  82         cmpwi cr0,r3,0
  83         beqlr cr0
  84         li r0,0
  85         stw r0,0(r3)
  86         blr
  87
  88 This is effectively a simple form of predication.
  89
  90 ===-------------------------------------------------------------------------===
  91
  92 Lump the constant pool for each function into ONE pic object, and reference
  93 pieces of it as offsets from the start.  For functions like this (contrived
  94 to have lots of constants obviously):
  95
  96 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
  97
  98 We generate:
  99
 100 _X:
 101         lis r2, ha16(.CPI_X_0)
 102         lfd f0, lo16(.CPI_X_0)(r2)
 103         lis r2, ha16(.CPI_X_1)
 104         lfd f2, lo16(.CPI_X_1)(r2)
 105         fmadd f0, f1, f0, f2
 106         lis r2, ha16(.CPI_X_2)
 107         lfd f1, lo16(.CPI_X_2)(r2)
 108         lis r2, ha16(.CPI_X_3)
 109         lfd f2, lo16(.CPI_X_3)(r2)
 110         fmadd f1, f0, f1, f2
 111         blr
 112
 113 It would be better to materialize .CPI_X into a register, then use immediates
 114 off of the register to avoid the lis's.  This is even more important in PIC
 115 mode.
 116
 117 Note that this (and the static variable version) is discussed here for GCC:
 118 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 119
 120 ===-------------------------------------------------------------------------===
 121
 122 PIC Code Gen IPO optimization:
 123
 124 Squish small scalar globals together into a single global struct, allowing the
 125 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
 126 of the GOT on targets with one).
 127
 128 Note that this is discussed here for GCC:
 129 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 130
 131 ===-------------------------------------------------------------------------===
 132
 133 Implement Newton-Rhapson method for improving estimate instructions to the
 134 correct accuracy, and implementing divide as multiply by reciprocal when it has
 135 more than one use.  Itanium will want this too.
 136
 137 ===-------------------------------------------------------------------------===
 138
 139 Compile this:
 140
 141 int %f1(int %a, int %b) {
 142         %tmp.1 = and int %a, 15         ; <int> [#uses=1]
 143         %tmp.3 = and int %b, 240                ; <int> [#uses=1]
 144         %tmp.4 = or int %tmp.3, %tmp.1          ; <int> [#uses=1]
 145         ret int %tmp.4
 146 }
 147
 148 without a copy.  We make this currently:
 149
 150 _f1:
 151         rlwinm r2, r4, 0, 24, 27
 152         rlwimi r2, r3, 0, 28, 31
 153         or r3, r2, r2
 154         blr
 155
 156 The two-addr pass or RA needs to learn when it is profitable to commute an
 157 instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
 158 currently only commutes to avoid inserting a copy BEFORE the two addr instr.
 159
 160 ===-------------------------------------------------------------------------===
 161
 162 Compile offsets from allocas:
 163
 164 int *%test() {
 165         %X = alloca { int, int }
 166         %Y = getelementptr {int,int}* %X, int 0, uint 1
 167         ret int* %Y
 168 }
 169
 170 into a single add, not two:
 171
 172 _test:
 173         addi r2, r1, -8
 174         addi r3, r2, 4
 175         blr
 176
 177 --> important for C++.
 178
 179 ===-------------------------------------------------------------------------===
 180
 181 No loads or stores of the constants should be needed:
 182
 183 struct foo { double X, Y; };
 184 void xxx(struct foo F);
 185 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 186
 187 ===-------------------------------------------------------------------------===
 188
 189 Darwin Stub LICM optimization:
 190
 191 Loops like this:
 192
 193   for (...)  bar();
 194
 195 Have to go through an indirect stub if bar is external or linkonce.  It would
 196 be better to compile it as:
 197
 198      fp = &bar;
 199      for (...)  fp();
 200
 201 which only computes the address of bar once (instead of each time through the
 202 stub).  This is Darwin specific and would have to be done in the code generator.
 203 Probably not a win on x86.
 204
 205 ===-------------------------------------------------------------------------===
 206
 207 Simple IPO for argument passing, change:
 208   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 209
 210 the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 211 of arguments get assigned to r3 through r10. That is, if you have a function
 212 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 213 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 214 order for functions we can internalize so that the maximum number of
 215 integers/pointers get passed in regs before you see any of the fp arguments.
 216
 217 Instead of implementing this, it would actually probably be easier to just
 218 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 219 including having this work sanely.
 220
 221 ===-------------------------------------------------------------------------===
 222
 223 Fix Darwin FP-In-Integer Registers ABI
 224
 225 Darwin passes doubles in structures in integer registers, which is very very
 226 bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
 227 that percolates these things out of functions.
 228
 229 Check out how horrible this is:
 230 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 231
 232 This is an extension of "interprocedural CC unmunging" that can't be done with
 233 just fastcc.
 234
 235 ===-------------------------------------------------------------------------===
 236
 237 Compile this:
 238
 239 int foo(int a) {
 240   int b = (a < 8);
 241   if (b) {
 242     return b * 3;     // ignore the fact that this is always 3.
 243   } else {
 244     return 2;
 245   }
 246 }
 247
 248 into something not this:
 249
 250 _foo:
 251 1)      cmpwi cr7, r3, 8
 252         mfcr r2, 1
 253         rlwinm r2, r2, 29, 31, 31
 254 1)      cmpwi cr0, r3, 7
 255         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 256 LBB1_1: ; then
 257         rlwinm r2, r2, 0, 31, 31
 258         mulli r3, r2, 3
 259         blr
 260 LBB1_2: ; UnifiedReturnBlock
 261         li r3, 2
 262         blr
 263
 264 In particular, the two compares (marked 1) could be shared by reversing one.
 265 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 266 same operands (but backwards) exists.  In this case, this wouldn't save us
 267 anything though, because the compares still wouldn't be shared.
 268
 269 ===-------------------------------------------------------------------------===
 270
 271 We should custom expand setcc instead of pretending that we have it.  That
 272 would allow us to expose the access of the crbit after the mfcr, allowing
 273 that access to be trivially folded into other ops.  A simple example:
 274
 275 int foo(int a, int b) { return (a < b) << 4; }
 276
 277 compiles into:
 278
 279 _foo:
 280         cmpw cr7, r3, r4
 281         mfcr r2, 1
 282         rlwinm r2, r2, 29, 31, 31
 283         slwi r3, r2, 4
 284         blr
 285
 286 ===-------------------------------------------------------------------------===
 287
 288 Fold add and sub with constant into non-extern, non-weak addresses so this:
 289
 290 static int a;
 291 void bar(int b) { a = b; }
 292 void foo(unsigned char *c) {
 293   *c = a;
 294 }
 295
 296 So that
 297
 298 _foo:
 299         lis r2, ha16(_a)
 300         la r2, lo16(_a)(r2)
 301         lbz r2, 3(r2)
 302         stb r2, 0(r3)
 303         blr
 304
 305 Becomes
 306
 307 _foo:
 308         lis r2, ha16(_a+3)
 309         lbz r2, lo16(_a+3)(r2)
 310         stb r2, 0(r3)
 311         blr
 312
 313 ===-------------------------------------------------------------------------===
 314
 315 We generate really bad code for this:
 316
 317 int f(signed char *a, _Bool b, _Bool c) {
 318    signed char t = 0;
 319   if (b)  t = *a;
 320   if (c)  *a = t;
 321 }
 322
 323 ===-------------------------------------------------------------------------===
 324
 325 This:
 326 int test(unsigned *P) { return *P >> 24; }
 327
 328 Should compile to:
 329
 330 _test:
 331         lbz r3,0(r3)
 332         blr
 333
 334 not:
 335
 336 _test:
 337         lwz r2, 0(r3)
 338         srwi r3, r2, 24
 339         blr
 340
 341 ===-------------------------------------------------------------------------===
 342
 343 On the G5, logical CR operations are more expensive in their three
 344 address form: ops that read/write the same register are half as expensive as
 345 those that read from two registers that are different from their destination.
 346
 347 We should model this with two separate instructions.  The isel should generate
 348 the "two address" form of the instructions.  When the register allocator
 349 detects that it needs to insert a copy due to the two-addresness of the CR
 350 logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
 351 we can convert to the "three address" instruction, to save code space.
 352
 353 This only matters when we start generating cr logical ops.
 354
 355 ===-------------------------------------------------------------------------===
 356
 357 We should compile these two functions to the same thing:
 358
 359 #include <stdlib.h>
 360 void f(int a, int b, int *P) {
 361   *P = (a-b)>=0?(a-b):(b-a);
 362 }
 363 void g(int a, int b, int *P) {
 364   *P = abs(a-b);
 365 }
 366
 367 Further, they should compile to something better than:
 368
 369 _g:
 370         subf r2, r4, r3
 371         subfic r3, r2, 0
 372         cmpwi cr0, r2, -1
 373         bgt cr0, LBB2_2 ; entry
 374 LBB2_1: ; entry
 375         mr r2, r3
 376 LBB2_2: ; entry
 377         stw r2, 0(r5)
 378         blr
 379
 380 GCC produces:
 381
 382 _g:
 383         subf r4,r4,r3
 384         srawi r2,r4,31
 385         xor r0,r2,r4
 386         subf r0,r2,r0
 387         stw r0,0(r5)
 388         blr
 389
 390 ... which is much nicer.
 391
 392 This theoretically may help improve twolf slightly (used in dimbox.c:142?).
 393
 394 ===-------------------------------------------------------------------------===
 395
 396 int foo(int N, int ***W, int **TK, int X) {
 397   int t, i;
 398
 399   for (t = 0; t < N; ++t)
 400     for (i = 0; i < 4; ++i)
 401       W[t / X][i][t % X] = TK[i][t];
 402
 403   return 5;
 404 }
 405
 406 We generate relatively atrocious code for this loop compared to gcc.
 407
 408 We could also strength reduce the rem and the div:
 409 http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
 410
 411 ===-------------------------------------------------------------------------===
 412
 413 float foo(float X) { return (int)(X); }
 414
 415 Currently produces:
 416
 417 _foo:
 418         fctiwz f0, f1
 419         stfd f0, -8(r1)
 420         lwz r2, -4(r1)
 421         extsw r2, r2
 422         std r2, -16(r1)
 423         lfd f0, -16(r1)
 424         fcfid f0, f0
 425         frsp f1, f0
 426         blr
 427
 428 We could use a target dag combine to turn the lwz/extsw into an lwa when the
 429 lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
 430 win only.
 431
 432 ===-------------------------------------------------------------------------===
 433
 434 We generate ugly code for this:
 435
 436 void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
 437   unsigned code = 0;
 438   if(dx < -dw) code |= 1;
 439   if(dx > dw)  code |= 2;
 440   if(dy < -dw) code |= 4;
 441   if(dy > dw)  code |= 8;
 442   if(dz < -dw) code |= 16;
 443   if(dz > dw)  code |= 32;
 444   *ret = code;
 445 }
 446
 447 ===-------------------------------------------------------------------------===
 448
 449 Complete the signed i32 to FP conversion code using 64-bit registers
 450 transformation, good for PI.  See PPCISelLowering.cpp, this comment:
 451
 452      // FIXME: disable this lowered code.  This generates 64-bit register values,
 453      // and we don't model the fact that the top part is clobbered by calls.  We
 454      // need to flag these together so that the value isn't live across a call.
 455      //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 456
 457 Also, if the registers are spilled to the stack, we have to ensure that all
 458 64-bits of them are save/restored, otherwise we will miscompile the code.  It
 459 sounds like we need to get the 64-bit register classes going.
 460
 461 ===-------------------------------------------------------------------------===
 462
 463 %struct.B = type { i8, [3 x i8] }
 464
 465 define void @bar(%struct.B* %b) {
 466 entry:
 467         %tmp = bitcast %struct.B* %b to i32*              ; <uint*> [#uses=1]
 468         %tmp = load i32* %tmp          ; <uint> [#uses=1]
 469         %tmp3 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=1]
 470         %tmp4 = load i32* %tmp3                ; <uint> [#uses=1]
 471         %tmp8 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=2]
 472         %tmp9 = load i32* %tmp8                ; <uint> [#uses=1]
 473         %tmp4.mask17 = shl i32 %tmp4, i8 1          ; <uint> [#uses=1]
 474         %tmp1415 = and i32 %tmp4.mask17, 2147483648            ; <uint> [#uses=1]
 475         %tmp.masked = and i32 %tmp, 2147483648         ; <uint> [#uses=1]
 476         %tmp11 = or i32 %tmp1415, %tmp.masked          ; <uint> [#uses=1]
 477         %tmp12 = and i32 %tmp9, 2147483647             ; <uint> [#uses=1]
 478         %tmp13 = or i32 %tmp12, %tmp11         ; <uint> [#uses=1]
 479         store i32 %tmp13, i32* %tmp8
 480         ret void
 481 }
 482
 483 We emit:
 484
 485 _foo:
 486         lwz r2, 0(r3)
 487         slwi r4, r2, 1
 488         or r4, r4, r2
 489         rlwimi r2, r4, 0, 0, 0
 490         stw r2, 0(r3)
 491         blr
 492
 493 We could collapse a bunch of those ORs and ANDs and generate the following
 494 equivalent code:
 495
 496 _foo:
 497         lwz r2, 0(r3)
 498         rlwinm r4, r2, 1, 0, 0
 499         or r2, r2, r4
 500         stw r2, 0(r3)
 501         blr
 502
 503 ===-------------------------------------------------------------------------===
 504
 505 We compile:
 506
 507 unsigned test6(unsigned x) {
 508   return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
 509 }
 510
 511 into:
 512
 513 _test6:
 514         lis r2, 255
 515         rlwinm r3, r3, 16, 0, 31
 516         ori r2, r2, 255
 517         and r3, r3, r2
 518         blr
 519
 520 GCC gets it down to:
 521
 522 _test6:
 523         rlwinm r0,r3,16,8,15
 524         rlwinm r3,r3,16,24,31
 525         or r3,r3,r0
 526         blr
 527
 528
 529 ===-------------------------------------------------------------------------===
 530
 531 Consider a function like this:
 532
 533 float foo(float X) { return X + 1234.4123f; }
 534
 535 The FP constant ends up in the constant pool, so we need to get the LR register.
 536  This ends up producing code like this:
 537
 538 _foo:
 539 .LBB_foo_0:     ; entry
 540         mflr r11
 541 ***     stw r11, 8(r1)
 542         bl "L00000$pb"
 543 "L00000$pb":
 544         mflr r2
 545         addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
 546         lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
 547         fadds f1, f1, f0
 548 ***     lwz r11, 8(r1)
 549         mtlr r11
 550         blr
 551
 552 This is functional, but there is no reason to spill the LR register all the way
 553 to the stack (the two marked instrs): spilling it to a GPR is quite enough.
 554
 555 Implementing this will require some codegen improvements.  Nate writes:
 556
 557 "So basically what we need to support the "no stack frame save and restore" is a
 558 generalization of the LR optimization to "callee-save regs".
 559
 560 Currently, we have LR marked as a callee-save reg.  The register allocator sees
 561 that it's callee save, and spills it directly to the stack.
 562
 563 Ideally, something like this would happen:
 564
 565 LR would be in a separate register class from the GPRs. The class of LR would be
 566 marked "unspillable".  When the register allocator came across an unspillable
 567 reg, it would ask "what is the best class to copy this into that I *can* spill"
 568 If it gets a class back, which it will in this case (the gprs), it grabs a free
 569 register of that class.  If it is then later necessary to spill that reg, so be
 570 it.
 571
 572 ===-------------------------------------------------------------------------===
 573
 574 We compile this:
 575 int test(_Bool X) {
 576   return X ? 524288 : 0;
 577 }
 578
 579 to:
 580 _test:
 581         cmplwi cr0, r3, 0
 582         lis r2, 8
 583         li r3, 0
 584         beq cr0, LBB1_2 ;entry
 585 LBB1_1: ;entry
 586         mr r3, r2
 587 LBB1_2: ;entry
 588         blr
 589
 590 instead of:
 591 _test:
 592         addic r2,r3,-1
 593         subfe r0,r2,r3
 594         slwi r3,r0,19
 595         blr
 596
 597 This sort of thing occurs a lot due to globalopt.
 598
 599 ===-------------------------------------------------------------------------===
 600
 601 We currently compile 32-bit bswap:
 602
 603 declare i32 @llvm.bswap.i32(i32 %A)
 604 define i32 @test(i32 %A) {
 605         %B = call i32 @llvm.bswap.i32(i32 %A)
 606         ret i32 %B
 607 }
 608
 609 to:
 610
 611 _test:
 612         rlwinm r2, r3, 24, 16, 23
 613         slwi r4, r3, 24
 614         rlwimi r2, r3, 8, 24, 31
 615         rlwimi r4, r3, 8, 8, 15
 616         rlwimi r4, r2, 0, 16, 31
 617         mr r3, r4
 618         blr
 619
 620 it would be more efficient to produce:
 621
 622 _foo:   mr r0,r3
 623         rlwinm r3,r3,8,0xffffffff
 624         rlwimi r3,r0,24,0,7
 625         rlwimi r3,r0,24,16,23
 626         blr
 627
 628 ===-------------------------------------------------------------------------===
 629
 630 test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:
 631
 632 __ZNK4llvm5APInt17countLeadingZerosEv:
 633         ld r2, 0(r3)
 634         cntlzd r2, r2
 635         or r2, r2, r2     <<-- silly.
 636         addi r3, r2, -64
 637         blr
 638
 639 The dead or is a 'truncate' from 64- to 32-bits.
 640
 641 ===-------------------------------------------------------------------------===
 642
 643 We generate horrible ppc code for this:
 644
 645 #define N  2000000
 646 double   a[N],c[N];
 647 void simpleloop() {
 648    int j;
 649    for (j=0; j<N; j++)
 650      c[j] = a[j];
 651 }
 652
 653 LBB1_1: ;bb
 654         lfdx f0, r3, r4
 655         addi r5, r5, 1                 ;; Extra IV for the exit value compare.
 656         stfdx f0, r2, r4
 657         addi r4, r4, 8
 658
 659         xoris r6, r5, 30               ;; This is due to a large immediate.
 660         cmplwi cr0, r6, 33920
 661         bne cr0, LBB1_1
 662
 663 ===-------------------------------------------------------------------------===
 664