lib/Target/PowerPC/README.txt

   1 TODO:
   2 * gpr0 allocation
   3 * implement do-loop -> bdnz transform
   4 * implement powerpc-64 for darwin
   5
   6 ===-------------------------------------------------------------------------===
   7
   8 Support 'update' load/store instructions.  These are cracked on the G5, but are
   9 still a codesize win.
  10
  11 ===-------------------------------------------------------------------------===
  12
  13 Should hint to the branch select pass that it doesn't need to print the second
  14 unconditional branch, so we don't end up with things like:
  15         b .LBBl42__2E_expand_function_8_674     ; loopentry.24
  16         b .LBBl42__2E_expand_function_8_42      ; NewDefault
  17         b .LBBl42__2E_expand_function_8_42      ; NewDefault
  18
  19 This occurs in SPASS.
  20
  21 The power of diet coke came up with a solution to this today:
  22
  23 We know the only two cases that can happen here are either:
  24 a) we have a conditional branch followed by a fallthrough to the next BB
  25 b) we have a conditional branch followed by an unconditional branch
  26
  27 We also invented the BRTWOWAY node to model (b).
  28
  29 Currently, these are modeled by the PPC_BRCOND node which is a 12-byte pseudo
  30 that codegens to
  31   bccinv false
  32 true:
  33   b truebb
  34 false:
  35   b falsebb
  36
  37 However, realizing that for (a), we can bccinv directly to the fallthrough
  38 block, and for (b) we will already have another unconditional branch after
  39 the conditional branch (see SPASS case above), then we know that we don't need
  40 BRTWOWAY at all, and can just codegen PPC_BRCOND as
  41
  42 bccinv +8
  43 b truebb
  44
  45 This will also allow us to selectively not run the ppc branch selector, by just
  46 selecting PPC_BRCOND pseudo directly to the correct conditional branch
  47 instruction for small functions.
  48
  49 ===-------------------------------------------------------------------------===
  50
  51 * Codegen this:
  52
  53    void test2(int X) {
  54      if (X == 0x12345678) bar();
  55    }
  56
  57     as:
  58
  59        xoris r0,r3,0x1234
  60        cmplwi cr0,r0,0x5678
  61        beq cr0,L6
  62
  63     not:
  64
  65         lis r2, 4660
  66         ori r2, r2, 22136
  67         cmpw cr0, r3, r2
  68         bne .LBB_test2_2
  69
  70 ===-------------------------------------------------------------------------===
  71
  72 Lump the constant pool for each function into ONE pic object, and reference
  73 pieces of it as offsets from the start.  For functions like this (contrived
  74 to have lots of constants obviously):
  75
  76 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
  77
  78 We generate:
  79
  80 _X:
  81         lis r2, ha16(.CPI_X_0)
  82         lfd f0, lo16(.CPI_X_0)(r2)
  83         lis r2, ha16(.CPI_X_1)
  84         lfd f2, lo16(.CPI_X_1)(r2)
  85         fmadd f0, f1, f0, f2
  86         lis r2, ha16(.CPI_X_2)
  87         lfd f1, lo16(.CPI_X_2)(r2)
  88         lis r2, ha16(.CPI_X_3)
  89         lfd f2, lo16(.CPI_X_3)(r2)
  90         fmadd f1, f0, f1, f2
  91         blr
  92
  93 It would be better to materialize .CPI_X into a register, then use immediates
  94 off of the register to avoid the lis's.  This is even more important in PIC
  95 mode.
  96
  97 Note that this (and the static variable version) is discussed here for GCC:
  98 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  99
 100 ===-------------------------------------------------------------------------===
 101
 102 PIC Code Gen IPO optimization:
 103
 104 Squish small scalar globals together into a single global struct, allowing the
 105 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
 106 of the GOT on targets with one).
 107
 108 Note that this is discussed here for GCC:
 109 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 110
 111 ===-------------------------------------------------------------------------===
 112
 113 Implement Newton-Rhapson method for improving estimate instructions to the
 114 correct accuracy, and implementing divide as multiply by reciprocal when it has
 115 more than one use.  Itanium will want this too.
 116
 117 ===-------------------------------------------------------------------------===
 118
 119 #define  ARRAY_LENGTH  16
 120
 121 union bitfield {
 122         struct {
 123 #ifndef __ppc__
 124                 unsigned int                       field0 : 6;
 125                 unsigned int                       field1 : 6;
 126                 unsigned int                       field2 : 6;
 127                 unsigned int                       field3 : 6;
 128                 unsigned int                       field4 : 3;
 129                 unsigned int                       field5 : 4;
 130                 unsigned int                       field6 : 1;
 131 #else
 132                 unsigned int                       field6 : 1;
 133                 unsigned int                       field5 : 4;
 134                 unsigned int                       field4 : 3;
 135                 unsigned int                       field3 : 6;
 136                 unsigned int                       field2 : 6;
 137                 unsigned int                       field1 : 6;
 138                 unsigned int                       field0 : 6;
 139 #endif
 140         } bitfields, bits;
 141         unsigned int    u32All;
 142         signed int      i32All;
 143         float   f32All;
 144 };
 145
 146
 147 typedef struct program_t {
 148         union bitfield    array[ARRAY_LENGTH];
 149     int               size;
 150     int               loaded;
 151 } program;
 152
 153
 154 void AdjustBitfields(program* prog, unsigned int fmt1)
 155 {
 156         prog->array[0].bitfields.field0 = fmt1;
 157         prog->array[0].bitfields.field1 = fmt1 + 1;
 158 }
 159
 160 We currently generate:
 161
 162 _AdjustBitfields:
 163         lwz r2, 0(r3)
 164         addi r5, r4, 1
 165         rlwinm r2, r2, 0, 0, 19
 166         rlwinm r5, r5, 6, 20, 25
 167         rlwimi r2, r4, 0, 26, 31
 168         or r2, r2, r5
 169         stw r2, 0(r3)
 170         blr
 171
 172 We should teach someone that or (rlwimi, rlwinm) with disjoint masks can be
 173 turned into rlwimi (rlwimi)
 174
 175 The better codegen would be:
 176
 177 _AdjustBitfields:
 178         lwz r0,0(r3)
 179         rlwinm r4,r4,0,0xff
 180         rlwimi r0,r4,0,26,31
 181         addi r4,r4,1
 182         rlwimi r0,r4,6,20,25
 183         stw r0,0(r3)
 184         blr
 185
 186 ===-------------------------------------------------------------------------===
 187
 188 Compile this:
 189
 190 int %f1(int %a, int %b) {
 191         %tmp.1 = and int %a, 15         ; <int> [#uses=1]
 192         %tmp.3 = and int %b, 240                ; <int> [#uses=1]
 193         %tmp.4 = or int %tmp.3, %tmp.1          ; <int> [#uses=1]
 194         ret int %tmp.4
 195 }
 196
 197 without a copy.  We make this currently:
 198
 199 _f1:
 200         rlwinm r2, r4, 0, 24, 27
 201         rlwimi r2, r3, 0, 28, 31
 202         or r3, r2, r2
 203         blr
 204
 205 The two-addr pass or RA needs to learn when it is profitable to commute an
 206 instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
 207 currently only commutes to avoid inserting a copy BEFORE the two addr instr.
 208
 209 ===-------------------------------------------------------------------------===
 210
 211 Compile offsets from allocas:
 212
 213 int *%test() {
 214         %X = alloca { int, int }
 215         %Y = getelementptr {int,int}* %X, int 0, uint 1
 216         ret int* %Y
 217 }
 218
 219 into a single add, not two:
 220
 221 _test:
 222         addi r2, r1, -8
 223         addi r3, r2, 4
 224         blr
 225
 226 --> important for C++.
 227
 228 ===-------------------------------------------------------------------------===
 229
 230 int test3(int a, int b) { return (a < 0) ? a : 0; }
 231
 232 should be branch free code.  LLVM is turning it into < 1 because of the RHS.
 233
 234 ===-------------------------------------------------------------------------===
 235
 236 No loads or stores of the constants should be needed:
 237
 238 struct foo { double X, Y; };
 239 void xxx(struct foo F);
 240 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 241
 242 ===-------------------------------------------------------------------------===
 243
 244 Darwin Stub LICM optimization:
 245
 246 Loops like this:
 247
 248   for (...)  bar();
 249
 250 Have to go through an indirect stub if bar is external or linkonce.  It would
 251 be better to compile it as:
 252
 253      fp = &bar;
 254      for (...)  fp();
 255
 256 which only computes the address of bar once (instead of each time through the
 257 stub).  This is Darwin specific and would have to be done in the code generator.
 258 Probably not a win on x86.
 259
 260 ===-------------------------------------------------------------------------===
 261
 262 PowerPC i1/setcc stuff (depends on subreg stuff):
 263
 264 Check out the PPC code we get for 'compare' in this testcase:
 265 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672
 266
 267 oof.  on top of not doing the logical crnand instead of (mfcr, mfcr,
 268 invert, invert, or), we then have to compare it against zero instead of
 269 using the value already in a CR!
 270
 271 that should be something like
 272         cmpw cr7, r8, r5
 273         cmpw cr0, r7, r3
 274         crnand cr0, cr0, cr7
 275         bne cr0, LBB_compare_4
 276
 277 instead of
 278         cmpw cr7, r8, r5
 279         cmpw cr0, r7, r3
 280         mfcr r7, 1
 281         mcrf cr7, cr0
 282         mfcr r8, 1
 283         rlwinm r7, r7, 30, 31, 31
 284         rlwinm r8, r8, 30, 31, 31
 285         xori r7, r7, 1
 286         xori r8, r8, 1
 287         addi r2, r2, 1
 288         or r7, r8, r7
 289         cmpwi cr0, r7, 0
 290         bne cr0, LBB_compare_4  ; loopexit
 291
 292 FreeBench/mason has a basic block that looks like this:
 293
 294          %tmp.130 = seteq int %p.0__, 5          ; <bool> [#uses=1]
 295          %tmp.134 = seteq int %p.1__, 6          ; <bool> [#uses=1]
 296          %tmp.139 = seteq int %p.2__, 12         ; <bool> [#uses=1]
 297          %tmp.144 = seteq int %p.3__, 13         ; <bool> [#uses=1]
 298          %tmp.149 = seteq int %p.4__, 14         ; <bool> [#uses=1]
 299          %tmp.154 = seteq int %p.5__, 15         ; <bool> [#uses=1]
 300          %bothcond = and bool %tmp.134, %tmp.130         ; <bool> [#uses=1]
 301          %bothcond123 = and bool %bothcond, %tmp.139             ; <bool>
 302          %bothcond124 = and bool %bothcond123, %tmp.144          ; <bool>
 303          %bothcond125 = and bool %bothcond124, %tmp.149          ; <bool>
 304          %bothcond126 = and bool %bothcond125, %tmp.154          ; <bool>
 305          br bool %bothcond126, label %shortcirc_next.5, label %else.0
 306
 307 This is a particularly important case where handling CRs better will help.
 308
 309 ===-------------------------------------------------------------------------===
 310
 311 Simple IPO for argument passing, change:
 312   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 313
 314 the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 315 of arguments get assigned to r3 through r10. That is, if you have a function
 316 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 317 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 318 order for functions we can internalize so that the maximum number of
 319 integers/pointers get passed in regs before you see any of the fp arguments.
 320
 321 Instead of implementing this, it would actually probably be easier to just
 322 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 323 including having this work sanely.
 324
 325 ===-------------------------------------------------------------------------===
 326
 327 Fix Darwin FP-In-Integer Registers ABI
 328
 329 Darwin passes doubles in structures in integer registers, which is very very
 330 bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
 331 that percolates these things out of functions.
 332
 333 Check out how horrible this is:
 334 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 335
 336 This is an extension of "interprocedural CC unmunging" that can't be done with
 337 just fastcc.
 338
 339 ===-------------------------------------------------------------------------===
 340
 341 Generate lwbrx and other byteswapping load/store instructions when reasonable.
 342
 343 ===-------------------------------------------------------------------------===
 344
 345 Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
 346 TargetConstantVec's if it's one of the many forms that are algorithmically
 347 computable using the spiffy altivec instructions.
 348
 349 ===-------------------------------------------------------------------------===
 350
 351 Compile this:
 352
 353 int foo(int a) {
 354   int b = (a < 8);
 355   if (b) {
 356     return b * 3;     // ignore the fact that this is always 3.
 357   } else {
 358     return 2;
 359   }
 360 }
 361
 362 into something not this:
 363
 364 _foo:
 365 1)      cmpwi cr7, r3, 8
 366         mfcr r2, 1
 367         rlwinm r2, r2, 29, 31, 31
 368 1)      cmpwi cr0, r3, 7
 369         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 370 LBB1_1: ; then
 371         rlwinm r2, r2, 0, 31, 31
 372         mulli r3, r2, 3
 373         blr
 374 LBB1_2: ; UnifiedReturnBlock
 375         li r3, 2
 376         blr
 377
 378 In particular, the two compares (marked 1) could be shared by reversing one.
 379 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 380 same operands (but backwards) exists.  In this case, this wouldn't save us
 381 anything though, because the compares still wouldn't be shared.
 382
 383 ===-------------------------------------------------------------------------===
 384
 385 The legalizer should lower this:
 386
 387 bool %test(ulong %x) {
 388   %tmp = setlt ulong %x, 4294967296
 389   ret bool %tmp
 390 }
 391
 392 into "if x.high == 0", not:
 393
 394 _test:
 395         addi r2, r3, -1
 396         cntlzw r2, r2
 397         cntlzw r3, r3
 398         srwi r2, r2, 5
 399         srwi r4, r3, 5
 400         li r3, 0
 401         cmpwi cr0, r2, 0
 402         bne cr0, LBB1_2 ;
 403 LBB1_1:
 404         or r3, r4, r4
 405 LBB1_2:
 406         blr
 407
 408 noticed in 2005-05-11-Popcount-ffs-fls.c.
 409
 410
 411 ===-------------------------------------------------------------------------===
 412
 413 We should custom expand setcc instead of pretending that we have it.  That
 414 would allow us to expose the access of the crbit after the mfcr, allowing
 415 that access to be trivially folded into other ops.  A simple example:
 416
 417 int foo(int a, int b) { return (a < b) << 4; }
 418
 419 compiles into:
 420
 421 _foo:
 422         cmpw cr7, r3, r4
 423         mfcr r2, 1
 424         rlwinm r2, r2, 29, 31, 31
 425         slwi r3, r2, 4
 426         blr
 427
 428 ===-------------------------------------------------------------------------===
 429
 430 Fold add and sub with constant into non-extern, non-weak addresses so this:
 431
 432 static int a;
 433 void bar(int b) { a = b; }
 434 void foo(unsigned char *c) {
 435   *c = a;
 436 }
 437
 438 So that
 439
 440 _foo:
 441         lis r2, ha16(_a)
 442         la r2, lo16(_a)(r2)
 443         lbz r2, 3(r2)
 444         stb r2, 0(r3)
 445         blr
 446
 447 Becomes
 448
 449 _foo:
 450         lis r2, ha16(_a+3)
 451         lbz r2, lo16(_a+3)(r2)
 452         stb r2, 0(r3)
 453         blr
 454
 455 ===-------------------------------------------------------------------------===
 456
 457 We generate really bad code for this:
 458
 459 int f(signed char *a, _Bool b, _Bool c) {
 460    signed char t = 0;
 461   if (b)  t = *a;
 462   if (c)  *a = t;
 463 }
 464
 465 ===-------------------------------------------------------------------------===
 466
 467 This:
 468 int test(unsigned *P) { return *P >> 24; }
 469
 470 Should compile to:
 471
 472 _test:
 473         lbz r3,0(r3)
 474         blr
 475
 476 not:
 477
 478 _test:
 479         lwz r2, 0(r3)
 480         srwi r3, r2, 24
 481         blr
 482
 483 ===-------------------------------------------------------------------------===
 484
 485 On the G5, logical CR operations are more expensive in their three
 486 address form: ops that read/write the same register are half as expensive as
 487 those that read from two registers that are different from their destination.
 488
 489 We should model this with two separate instructions.  The isel should generate
 490 the "two address" form of the instructions.  When the register allocator
 491 detects that it needs to insert a copy due to the two-addresness of the CR
 492 logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
 493 we can convert to the "three address" instruction, to save code space.
 494
 495 This only matters when we start generating cr logical ops.
 496
 497 ===-------------------------------------------------------------------------===
 498
 499 We should compile these two functions to the same thing:
 500
 501 #include <stdlib.h>
 502 void f(int a, int b, int *P) {
 503   *P = (a-b)>=0?(a-b):(b-a);
 504 }
 505 void g(int a, int b, int *P) {
 506   *P = abs(a-b);
 507 }
 508
 509 Further, they should compile to something better than:
 510
 511 _g:
 512         subf r2, r4, r3
 513         subfic r3, r2, 0
 514         cmpwi cr0, r2, -1
 515         bgt cr0, LBB2_2 ; entry
 516 LBB2_1: ; entry
 517         mr r2, r3
 518 LBB2_2: ; entry
 519         stw r2, 0(r5)
 520         blr
 521
 522 GCC produces:
 523
 524 _g:
 525         subf r4,r4,r3
 526         srawi r2,r4,31
 527         xor r0,r2,r4
 528         subf r0,r2,r0
 529         stw r0,0(r5)
 530         blr
 531
 532 ... which is much nicer.
 533
 534 This theoretically may help improve twolf slightly (used in dimbox.c:142?).
 535
 536 ===-------------------------------------------------------------------------===
 537
 538 Implement PPCInstrInfo::isLoadFromStackSlot/isStoreToStackSlot for vector
 539 registers, to generate better spill code.
 540
 541 ===-------------------------------------------------------------------------===
 542 int foo(int N, int ***W, int **TK, int X) {
 543   int t, i;
 544
 545   for (t = 0; t < N; ++t)
 546     for (i = 0; i < 4; ++i)
 547       W[t / X][i][t % X] = TK[i][t];
 548
 549   return 5;
 550 }
 551
 552 We generate relatively atrocious code for this loop compared to gcc.
 553
 554
 555