lib/Target/PowerPC/README.txt

   1 //===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
   2
   3 TODO:
   4 * gpr0 allocation
   5 * implement do-loop -> bdnz transform
   6
   7 ===-------------------------------------------------------------------------===
   8
   9 Support 'update' load/store instructions.  These are cracked on the G5, but are
  10 still a codesize win.
  11
  12 With preinc enabled, this:
  13
  14 long *%test4(long *%X, long *%dest) {
  15         %Y = getelementptr long* %X, int 4
  16         %A = load long* %Y
  17         store long %A, long* %dest
  18         ret long* %Y
  19 }
  20
  21 compiles to:
  22
  23 _test4:
  24         mr r2, r3
  25         lwzu r5, 32(r2)
  26         lwz r3, 36(r3)
  27         stw r5, 0(r4)
  28         stw r3, 4(r4)
  29         mr r3, r2
  30         blr
  31
  32 with -sched=list-burr, I get:
  33
  34 _test4:
  35         lwz r2, 36(r3)
  36         lwzu r5, 32(r3)
  37         stw r2, 4(r4)
  38         stw r5, 0(r4)
  39         blr
  40
  41 ===-------------------------------------------------------------------------===
  42
  43 We compile the hottest inner loop of viterbi to:
  44
  45         li r6, 0
  46         b LBB1_84       ;bb432.i
  47 LBB1_83:        ;bb420.i
  48         lbzx r8, r5, r7
  49         addi r6, r7, 1
  50         stbx r8, r4, r7
  51 LBB1_84:        ;bb432.i
  52         mr r7, r6
  53         cmplwi cr0, r7, 143
  54         bne cr0, LBB1_83        ;bb420.i
  55
  56 The CBE manages to produce:
  57
  58         li r0, 143
  59         mtctr r0
  60 loop:
  61         lbzx r2, r2, r11
  62         stbx r0, r2, r9
  63         addi r2, r2, 1
  64         bdz later
  65         b loop
  66
  67 This could be much better (bdnz instead of bdz) but it still beats us.  If we
  68 produced this with bdnz, the loop would be a single dispatch group.
  69
  70 ===-------------------------------------------------------------------------===
  71
  72 Compile:
  73
  74 void foo(int *P) {
  75  if (P)  *P = 0;
  76 }
  77
  78 into:
  79
  80 _foo:
  81         cmpwi cr0,r3,0
  82         beqlr cr0
  83         li r0,0
  84         stw r0,0(r3)
  85         blr
  86
  87 This is effectively a simple form of predication.
  88
  89 ===-------------------------------------------------------------------------===
  90
  91 Lump the constant pool for each function into ONE pic object, and reference
  92 pieces of it as offsets from the start.  For functions like this (contrived
  93 to have lots of constants obviously):
  94
  95 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
  96
  97 We generate:
  98
  99 _X:
 100         lis r2, ha16(.CPI_X_0)
 101         lfd f0, lo16(.CPI_X_0)(r2)
 102         lis r2, ha16(.CPI_X_1)
 103         lfd f2, lo16(.CPI_X_1)(r2)
 104         fmadd f0, f1, f0, f2
 105         lis r2, ha16(.CPI_X_2)
 106         lfd f1, lo16(.CPI_X_2)(r2)
 107         lis r2, ha16(.CPI_X_3)
 108         lfd f2, lo16(.CPI_X_3)(r2)
 109         fmadd f1, f0, f1, f2
 110         blr
 111
 112 It would be better to materialize .CPI_X into a register, then use immediates
 113 off of the register to avoid the lis's.  This is even more important in PIC
 114 mode.
 115
 116 Note that this (and the static variable version) is discussed here for GCC:
 117 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 118
 119 ===-------------------------------------------------------------------------===
 120
 121 PIC Code Gen IPO optimization:
 122
 123 Squish small scalar globals together into a single global struct, allowing the
 124 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
 125 of the GOT on targets with one).
 126
 127 Note that this is discussed here for GCC:
 128 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 129
 130 ===-------------------------------------------------------------------------===
 131
 132 Implement Newton-Rhapson method for improving estimate instructions to the
 133 correct accuracy, and implementing divide as multiply by reciprocal when it has
 134 more than one use.  Itanium will want this too.
 135
 136 ===-------------------------------------------------------------------------===
 137
 138 Compile this:
 139
 140 int %f1(int %a, int %b) {
 141         %tmp.1 = and int %a, 15         ; <int> [#uses=1]
 142         %tmp.3 = and int %b, 240                ; <int> [#uses=1]
 143         %tmp.4 = or int %tmp.3, %tmp.1          ; <int> [#uses=1]
 144         ret int %tmp.4
 145 }
 146
 147 without a copy.  We make this currently:
 148
 149 _f1:
 150         rlwinm r2, r4, 0, 24, 27
 151         rlwimi r2, r3, 0, 28, 31
 152         or r3, r2, r2
 153         blr
 154
 155 The two-addr pass or RA needs to learn when it is profitable to commute an
 156 instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
 157 currently only commutes to avoid inserting a copy BEFORE the two addr instr.
 158
 159 ===-------------------------------------------------------------------------===
 160
 161 Compile offsets from allocas:
 162
 163 int *%test() {
 164         %X = alloca { int, int }
 165         %Y = getelementptr {int,int}* %X, int 0, uint 1
 166         ret int* %Y
 167 }
 168
 169 into a single add, not two:
 170
 171 _test:
 172         addi r2, r1, -8
 173         addi r3, r2, 4
 174         blr
 175
 176 --> important for C++.
 177
 178 ===-------------------------------------------------------------------------===
 179
 180 No loads or stores of the constants should be needed:
 181
 182 struct foo { double X, Y; };
 183 void xxx(struct foo F);
 184 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 185
 186 ===-------------------------------------------------------------------------===
 187
 188 Darwin Stub LICM optimization:
 189
 190 Loops like this:
 191
 192   for (...)  bar();
 193
 194 Have to go through an indirect stub if bar is external or linkonce.  It would
 195 be better to compile it as:
 196
 197      fp = &bar;
 198      for (...)  fp();
 199
 200 which only computes the address of bar once (instead of each time through the
 201 stub).  This is Darwin specific and would have to be done in the code generator.
 202 Probably not a win on x86.
 203
 204 ===-------------------------------------------------------------------------===
 205
 206 Simple IPO for argument passing, change:
 207   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 208
 209 the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 210 of arguments get assigned to r3 through r10. That is, if you have a function
 211 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 212 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 213 order for functions we can internalize so that the maximum number of
 214 integers/pointers get passed in regs before you see any of the fp arguments.
 215
 216 Instead of implementing this, it would actually probably be easier to just
 217 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 218 including having this work sanely.
 219
 220 ===-------------------------------------------------------------------------===
 221
 222 Fix Darwin FP-In-Integer Registers ABI
 223
 224 Darwin passes doubles in structures in integer registers, which is very very
 225 bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
 226 that percolates these things out of functions.
 227
 228 Check out how horrible this is:
 229 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 230
 231 This is an extension of "interprocedural CC unmunging" that can't be done with
 232 just fastcc.
 233
 234 ===-------------------------------------------------------------------------===
 235
 236 Compile this:
 237
 238 int foo(int a) {
 239   int b = (a < 8);
 240   if (b) {
 241     return b * 3;     // ignore the fact that this is always 3.
 242   } else {
 243     return 2;
 244   }
 245 }
 246
 247 into something not this:
 248
 249 _foo:
 250 1)      cmpwi cr7, r3, 8
 251         mfcr r2, 1
 252         rlwinm r2, r2, 29, 31, 31
 253 1)      cmpwi cr0, r3, 7
 254         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 255 LBB1_1: ; then
 256         rlwinm r2, r2, 0, 31, 31
 257         mulli r3, r2, 3
 258         blr
 259 LBB1_2: ; UnifiedReturnBlock
 260         li r3, 2
 261         blr
 262
 263 In particular, the two compares (marked 1) could be shared by reversing one.
 264 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 265 same operands (but backwards) exists.  In this case, this wouldn't save us
 266 anything though, because the compares still wouldn't be shared.
 267
 268 ===-------------------------------------------------------------------------===
 269
 270 We should custom expand setcc instead of pretending that we have it.  That
 271 would allow us to expose the access of the crbit after the mfcr, allowing
 272 that access to be trivially folded into other ops.  A simple example:
 273
 274 int foo(int a, int b) { return (a < b) << 4; }
 275
 276 compiles into:
 277
 278 _foo:
 279         cmpw cr7, r3, r4
 280         mfcr r2, 1
 281         rlwinm r2, r2, 29, 31, 31
 282         slwi r3, r2, 4
 283         blr
 284
 285 ===-------------------------------------------------------------------------===
 286
 287 Fold add and sub with constant into non-extern, non-weak addresses so this:
 288
 289 static int a;
 290 void bar(int b) { a = b; }
 291 void foo(unsigned char *c) {
 292   *c = a;
 293 }
 294
 295 So that
 296
 297 _foo:
 298         lis r2, ha16(_a)
 299         la r2, lo16(_a)(r2)
 300         lbz r2, 3(r2)
 301         stb r2, 0(r3)
 302         blr
 303
 304 Becomes
 305
 306 _foo:
 307         lis r2, ha16(_a+3)
 308         lbz r2, lo16(_a+3)(r2)
 309         stb r2, 0(r3)
 310         blr
 311
 312 ===-------------------------------------------------------------------------===
 313
 314 We generate really bad code for this:
 315
 316 int f(signed char *a, _Bool b, _Bool c) {
 317    signed char t = 0;
 318   if (b)  t = *a;
 319   if (c)  *a = t;
 320 }
 321
 322 ===-------------------------------------------------------------------------===
 323
 324 This:
 325 int test(unsigned *P) { return *P >> 24; }
 326
 327 Should compile to:
 328
 329 _test:
 330         lbz r3,0(r3)
 331         blr
 332
 333 not:
 334
 335 _test:
 336         lwz r2, 0(r3)
 337         srwi r3, r2, 24
 338         blr
 339
 340 ===-------------------------------------------------------------------------===
 341
 342 On the G5, logical CR operations are more expensive in their three
 343 address form: ops that read/write the same register are half as expensive as
 344 those that read from two registers that are different from their destination.
 345
 346 We should model this with two separate instructions.  The isel should generate
 347 the "two address" form of the instructions.  When the register allocator
 348 detects that it needs to insert a copy due to the two-addresness of the CR
 349 logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
 350 we can convert to the "three address" instruction, to save code space.
 351
 352 This only matters when we start generating cr logical ops.
 353
 354 ===-------------------------------------------------------------------------===
 355
 356 We should compile these two functions to the same thing:
 357
 358 #include <stdlib.h>
 359 void f(int a, int b, int *P) {
 360   *P = (a-b)>=0?(a-b):(b-a);
 361 }
 362 void g(int a, int b, int *P) {
 363   *P = abs(a-b);
 364 }
 365
 366 Further, they should compile to something better than:
 367
 368 _g:
 369         subf r2, r4, r3
 370         subfic r3, r2, 0
 371         cmpwi cr0, r2, -1
 372         bgt cr0, LBB2_2 ; entry
 373 LBB2_1: ; entry
 374         mr r2, r3
 375 LBB2_2: ; entry
 376         stw r2, 0(r5)
 377         blr
 378
 379 GCC produces:
 380
 381 _g:
 382         subf r4,r4,r3
 383         srawi r2,r4,31
 384         xor r0,r2,r4
 385         subf r0,r2,r0
 386         stw r0,0(r5)
 387         blr
 388
 389 ... which is much nicer.
 390
 391 This theoretically may help improve twolf slightly (used in dimbox.c:142?).
 392
 393 ===-------------------------------------------------------------------------===
 394
 395 int foo(int N, int ***W, int **TK, int X) {
 396   int t, i;
 397
 398   for (t = 0; t < N; ++t)
 399     for (i = 0; i < 4; ++i)
 400       W[t / X][i][t % X] = TK[i][t];
 401
 402   return 5;
 403 }
 404
 405 We generate relatively atrocious code for this loop compared to gcc.
 406
 407 We could also strength reduce the rem and the div:
 408 http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
 409
 410 ===-------------------------------------------------------------------------===
 411
 412 float foo(float X) { return (int)(X); }
 413
 414 Currently produces:
 415
 416 _foo:
 417         fctiwz f0, f1
 418         stfd f0, -8(r1)
 419         lwz r2, -4(r1)
 420         extsw r2, r2
 421         std r2, -16(r1)
 422         lfd f0, -16(r1)
 423         fcfid f0, f0
 424         frsp f1, f0
 425         blr
 426
 427 We could use a target dag combine to turn the lwz/extsw into an lwa when the
 428 lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
 429 win only.
 430
 431 ===-------------------------------------------------------------------------===
 432
 433 We generate ugly code for this:
 434
 435 void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
 436   unsigned code = 0;
 437   if(dx < -dw) code |= 1;
 438   if(dx > dw)  code |= 2;
 439   if(dy < -dw) code |= 4;
 440   if(dy > dw)  code |= 8;
 441   if(dz < -dw) code |= 16;
 442   if(dz > dw)  code |= 32;
 443   *ret = code;
 444 }
 445
 446 ===-------------------------------------------------------------------------===
 447
 448 Complete the signed i32 to FP conversion code using 64-bit registers
 449 transformation, good for PI.  See PPCISelLowering.cpp, this comment:
 450
 451      // FIXME: disable this lowered code.  This generates 64-bit register values,
 452      // and we don't model the fact that the top part is clobbered by calls.  We
 453      // need to flag these together so that the value isn't live across a call.
 454      //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 455
 456 Also, if the registers are spilled to the stack, we have to ensure that all
 457 64-bits of them are save/restored, otherwise we will miscompile the code.  It
 458 sounds like we need to get the 64-bit register classes going.
 459
 460 ===-------------------------------------------------------------------------===
 461
 462 %struct.B = type { i8, [3 x i8] }
 463
 464 define void @bar(%struct.B* %b) {
 465 entry:
 466         %tmp = bitcast %struct.B* %b to i32*              ; <uint*> [#uses=1]
 467         %tmp = load i32* %tmp          ; <uint> [#uses=1]
 468         %tmp3 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=1]
 469         %tmp4 = load i32* %tmp3                ; <uint> [#uses=1]
 470         %tmp8 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=2]
 471         %tmp9 = load i32* %tmp8                ; <uint> [#uses=1]
 472         %tmp4.mask17 = shl i32 %tmp4, i8 1          ; <uint> [#uses=1]
 473         %tmp1415 = and i32 %tmp4.mask17, 2147483648            ; <uint> [#uses=1]
 474         %tmp.masked = and i32 %tmp, 2147483648         ; <uint> [#uses=1]
 475         %tmp11 = or i32 %tmp1415, %tmp.masked          ; <uint> [#uses=1]
 476         %tmp12 = and i32 %tmp9, 2147483647             ; <uint> [#uses=1]
 477         %tmp13 = or i32 %tmp12, %tmp11         ; <uint> [#uses=1]
 478         store i32 %tmp13, i32* %tmp8
 479         ret void
 480 }
 481
 482 We emit:
 483
 484 _foo:
 485         lwz r2, 0(r3)
 486         slwi r4, r2, 1
 487         or r4, r4, r2
 488         rlwimi r2, r4, 0, 0, 0
 489         stw r2, 0(r3)
 490         blr
 491
 492 We could collapse a bunch of those ORs and ANDs and generate the following
 493 equivalent code:
 494
 495 _foo:
 496         lwz r2, 0(r3)
 497         rlwinm r4, r2, 1, 0, 0
 498         or r2, r2, r4
 499         stw r2, 0(r3)
 500         blr
 501
 502 ===-------------------------------------------------------------------------===
 503
 504 We compile:
 505
 506 unsigned test6(unsigned x) {
 507   return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
 508 }
 509
 510 into:
 511
 512 _test6:
 513         lis r2, 255
 514         rlwinm r3, r3, 16, 0, 31
 515         ori r2, r2, 255
 516         and r3, r3, r2
 517         blr
 518
 519 GCC gets it down to:
 520
 521 _test6:
 522         rlwinm r0,r3,16,8,15
 523         rlwinm r3,r3,16,24,31
 524         or r3,r3,r0
 525         blr
 526
 527
 528 ===-------------------------------------------------------------------------===
 529
 530 Consider a function like this:
 531
 532 float foo(float X) { return X + 1234.4123f; }
 533
 534 The FP constant ends up in the constant pool, so we need to get the LR register.
 535  This ends up producing code like this:
 536
 537 _foo:
 538 .LBB_foo_0:     ; entry
 539         mflr r11
 540 ***     stw r11, 8(r1)
 541         bl "L00000$pb"
 542 "L00000$pb":
 543         mflr r2
 544         addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
 545         lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
 546         fadds f1, f1, f0
 547 ***     lwz r11, 8(r1)
 548         mtlr r11
 549         blr
 550
 551 This is functional, but there is no reason to spill the LR register all the way
 552 to the stack (the two marked instrs): spilling it to a GPR is quite enough.
 553
 554 Implementing this will require some codegen improvements.  Nate writes:
 555
 556 "So basically what we need to support the "no stack frame save and restore" is a
 557 generalization of the LR optimization to "callee-save regs".
 558
 559 Currently, we have LR marked as a callee-save reg.  The register allocator sees
 560 that it's callee save, and spills it directly to the stack.
 561
 562 Ideally, something like this would happen:
 563
 564 LR would be in a separate register class from the GPRs. The class of LR would be
 565 marked "unspillable".  When the register allocator came across an unspillable
 566 reg, it would ask "what is the best class to copy this into that I *can* spill"
 567 If it gets a class back, which it will in this case (the gprs), it grabs a free
 568 register of that class.  If it is then later necessary to spill that reg, so be
 569 it.
 570
 571 ===-------------------------------------------------------------------------===
 572
 573 We compile this:
 574 int test(_Bool X) {
 575   return X ? 524288 : 0;
 576 }
 577
 578 to:
 579 _test:
 580         cmplwi cr0, r3, 0
 581         lis r2, 8
 582         li r3, 0
 583         beq cr0, LBB1_2 ;entry
 584 LBB1_1: ;entry
 585         mr r3, r2
 586 LBB1_2: ;entry
 587         blr
 588
 589 instead of:
 590 _test:
 591         addic r2,r3,-1
 592         subfe r0,r2,r3
 593         slwi r3,r0,19
 594         blr
 595
 596 This sort of thing occurs a lot due to globalopt.
 597
 598 ===-------------------------------------------------------------------------===
 599
 600 We currently compile 32-bit bswap:
 601
 602 declare i32 @llvm.bswap.i32(i32 %A)
 603 define i32 @test(i32 %A) {
 604         %B = call i32 @llvm.bswap.i32(i32 %A)
 605         ret i32 %B
 606 }
 607
 608 to:
 609
 610 _test:
 611         rlwinm r2, r3, 24, 16, 23
 612         slwi r4, r3, 24
 613         rlwimi r2, r3, 8, 24, 31
 614         rlwimi r4, r3, 8, 8, 15
 615         rlwimi r4, r2, 0, 16, 31
 616         mr r3, r4
 617         blr
 618
 619 it would be more efficient to produce:
 620
 621 _foo:   mr r0,r3
 622         rlwinm r3,r3,8,0xffffffff
 623         rlwimi r3,r0,24,0,7
 624         rlwimi r3,r0,24,16,23
 625         blr
 626
 627 ===-------------------------------------------------------------------------===