lib/Target/PowerPC/README.txt

   1 //===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
   2
   3 TODO:
   4 * gpr0 allocation
   5 * implement do-loop -> bdnz transform
   6 * implement powerpc-64 for darwin
   7
   8 ===-------------------------------------------------------------------------===
   9
  10 Support 'update' load/store instructions.  These are cracked on the G5, but are
  11 still a codesize win.
  12
  13 ===-------------------------------------------------------------------------===
  14
  15 Teach the .td file to pattern match PPC::BR_COND to appropriate bc variant, so
  16 we don't have to always run the branch selector for small functions.
  17
  18 ===-------------------------------------------------------------------------===
  19
  20 * Codegen this:
  21
  22    void test2(int X) {
  23      if (X == 0x12345678) bar();
  24    }
  25
  26     as:
  27
  28        xoris r0,r3,0x1234
  29        cmplwi cr0,r0,0x5678
  30        beq cr0,L6
  31
  32     not:
  33
  34         lis r2, 4660
  35         ori r2, r2, 22136
  36         cmpw cr0, r3, r2
  37         bne .LBB_test2_2
  38
  39 ===-------------------------------------------------------------------------===
  40
  41 Lump the constant pool for each function into ONE pic object, and reference
  42 pieces of it as offsets from the start.  For functions like this (contrived
  43 to have lots of constants obviously):
  44
  45 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
  46
  47 We generate:
  48
  49 _X:
  50         lis r2, ha16(.CPI_X_0)
  51         lfd f0, lo16(.CPI_X_0)(r2)
  52         lis r2, ha16(.CPI_X_1)
  53         lfd f2, lo16(.CPI_X_1)(r2)
  54         fmadd f0, f1, f0, f2
  55         lis r2, ha16(.CPI_X_2)
  56         lfd f1, lo16(.CPI_X_2)(r2)
  57         lis r2, ha16(.CPI_X_3)
  58         lfd f2, lo16(.CPI_X_3)(r2)
  59         fmadd f1, f0, f1, f2
  60         blr
  61
  62 It would be better to materialize .CPI_X into a register, then use immediates
  63 off of the register to avoid the lis's.  This is even more important in PIC
  64 mode.
  65
  66 Note that this (and the static variable version) is discussed here for GCC:
  67 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  68
  69 ===-------------------------------------------------------------------------===
  70
  71 PIC Code Gen IPO optimization:
  72
  73 Squish small scalar globals together into a single global struct, allowing the
  74 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
  75 of the GOT on targets with one).
  76
  77 Note that this is discussed here for GCC:
  78 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  79
  80 ===-------------------------------------------------------------------------===
  81
  82 Implement Newton-Rhapson method for improving estimate instructions to the
  83 correct accuracy, and implementing divide as multiply by reciprocal when it has
  84 more than one use.  Itanium will want this too.
  85
  86 ===-------------------------------------------------------------------------===
  87
  88 #define  ARRAY_LENGTH  16
  89
  90 union bitfield {
  91         struct {
  92 #ifndef __ppc__
  93                 unsigned int                       field0 : 6;
  94                 unsigned int                       field1 : 6;
  95                 unsigned int                       field2 : 6;
  96                 unsigned int                       field3 : 6;
  97                 unsigned int                       field4 : 3;
  98                 unsigned int                       field5 : 4;
  99                 unsigned int                       field6 : 1;
 100 #else
 101                 unsigned int                       field6 : 1;
 102                 unsigned int                       field5 : 4;
 103                 unsigned int                       field4 : 3;
 104                 unsigned int                       field3 : 6;
 105                 unsigned int                       field2 : 6;
 106                 unsigned int                       field1 : 6;
 107                 unsigned int                       field0 : 6;
 108 #endif
 109         } bitfields, bits;
 110         unsigned int    u32All;
 111         signed int      i32All;
 112         float   f32All;
 113 };
 114
 115
 116 typedef struct program_t {
 117         union bitfield    array[ARRAY_LENGTH];
 118     int               size;
 119     int               loaded;
 120 } program;
 121
 122
 123 void AdjustBitfields(program* prog, unsigned int fmt1)
 124 {
 125         prog->array[0].bitfields.field0 = fmt1;
 126         prog->array[0].bitfields.field1 = fmt1 + 1;
 127 }
 128
 129 We currently generate:
 130
 131 _AdjustBitfields:
 132         lwz r2, 0(r3)
 133         addi r5, r4, 1
 134         rlwinm r2, r2, 0, 0, 19
 135         rlwinm r5, r5, 6, 20, 25
 136         rlwimi r2, r4, 0, 26, 31
 137         or r2, r2, r5
 138         stw r2, 0(r3)
 139         blr
 140
 141 We should teach someone that or (rlwimi, rlwinm) with disjoint masks can be
 142 turned into rlwimi (rlwimi)
 143
 144 The better codegen would be:
 145
 146 _AdjustBitfields:
 147         lwz r0,0(r3)
 148         rlwinm r4,r4,0,0xff
 149         rlwimi r0,r4,0,26,31
 150         addi r4,r4,1
 151         rlwimi r0,r4,6,20,25
 152         stw r0,0(r3)
 153         blr
 154
 155 ===-------------------------------------------------------------------------===
 156
 157 Compile this:
 158
 159 int %f1(int %a, int %b) {
 160         %tmp.1 = and int %a, 15         ; <int> [#uses=1]
 161         %tmp.3 = and int %b, 240                ; <int> [#uses=1]
 162         %tmp.4 = or int %tmp.3, %tmp.1          ; <int> [#uses=1]
 163         ret int %tmp.4
 164 }
 165
 166 without a copy.  We make this currently:
 167
 168 _f1:
 169         rlwinm r2, r4, 0, 24, 27
 170         rlwimi r2, r3, 0, 28, 31
 171         or r3, r2, r2
 172         blr
 173
 174 The two-addr pass or RA needs to learn when it is profitable to commute an
 175 instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
 176 currently only commutes to avoid inserting a copy BEFORE the two addr instr.
 177
 178 ===-------------------------------------------------------------------------===
 179
 180 Compile offsets from allocas:
 181
 182 int *%test() {
 183         %X = alloca { int, int }
 184         %Y = getelementptr {int,int}* %X, int 0, uint 1
 185         ret int* %Y
 186 }
 187
 188 into a single add, not two:
 189
 190 _test:
 191         addi r2, r1, -8
 192         addi r3, r2, 4
 193         blr
 194
 195 --> important for C++.
 196
 197 ===-------------------------------------------------------------------------===
 198
 199 int test3(int a, int b) { return (a < 0) ? a : 0; }
 200
 201 should be branch free code.  LLVM is turning it into < 1 because of the RHS.
 202
 203 ===-------------------------------------------------------------------------===
 204
 205 No loads or stores of the constants should be needed:
 206
 207 struct foo { double X, Y; };
 208 void xxx(struct foo F);
 209 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 210
 211 ===-------------------------------------------------------------------------===
 212
 213 Darwin Stub LICM optimization:
 214
 215 Loops like this:
 216
 217   for (...)  bar();
 218
 219 Have to go through an indirect stub if bar is external or linkonce.  It would
 220 be better to compile it as:
 221
 222      fp = &bar;
 223      for (...)  fp();
 224
 225 which only computes the address of bar once (instead of each time through the
 226 stub).  This is Darwin specific and would have to be done in the code generator.
 227 Probably not a win on x86.
 228
 229 ===-------------------------------------------------------------------------===
 230
 231 PowerPC i1/setcc stuff (depends on subreg stuff):
 232
 233 Check out the PPC code we get for 'compare' in this testcase:
 234 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672
 235
 236 oof.  on top of not doing the logical crnand instead of (mfcr, mfcr,
 237 invert, invert, or), we then have to compare it against zero instead of
 238 using the value already in a CR!
 239
 240 that should be something like
 241         cmpw cr7, r8, r5
 242         cmpw cr0, r7, r3
 243         crnand cr0, cr0, cr7
 244         bne cr0, LBB_compare_4
 245
 246 instead of
 247         cmpw cr7, r8, r5
 248         cmpw cr0, r7, r3
 249         mfcr r7, 1
 250         mcrf cr7, cr0
 251         mfcr r8, 1
 252         rlwinm r7, r7, 30, 31, 31
 253         rlwinm r8, r8, 30, 31, 31
 254         xori r7, r7, 1
 255         xori r8, r8, 1
 256         addi r2, r2, 1
 257         or r7, r8, r7
 258         cmpwi cr0, r7, 0
 259         bne cr0, LBB_compare_4  ; loopexit
 260
 261 FreeBench/mason has a basic block that looks like this:
 262
 263          %tmp.130 = seteq int %p.0__, 5          ; <bool> [#uses=1]
 264          %tmp.134 = seteq int %p.1__, 6          ; <bool> [#uses=1]
 265          %tmp.139 = seteq int %p.2__, 12         ; <bool> [#uses=1]
 266          %tmp.144 = seteq int %p.3__, 13         ; <bool> [#uses=1]
 267          %tmp.149 = seteq int %p.4__, 14         ; <bool> [#uses=1]
 268          %tmp.154 = seteq int %p.5__, 15         ; <bool> [#uses=1]
 269          %bothcond = and bool %tmp.134, %tmp.130         ; <bool> [#uses=1]
 270          %bothcond123 = and bool %bothcond, %tmp.139             ; <bool>
 271          %bothcond124 = and bool %bothcond123, %tmp.144          ; <bool>
 272          %bothcond125 = and bool %bothcond124, %tmp.149          ; <bool>
 273          %bothcond126 = and bool %bothcond125, %tmp.154          ; <bool>
 274          br bool %bothcond126, label %shortcirc_next.5, label %else.0
 275
 276 This is a particularly important case where handling CRs better will help.
 277
 278 ===-------------------------------------------------------------------------===
 279
 280 Simple IPO for argument passing, change:
 281   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 282
 283 the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 284 of arguments get assigned to r3 through r10. That is, if you have a function
 285 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 286 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 287 order for functions we can internalize so that the maximum number of
 288 integers/pointers get passed in regs before you see any of the fp arguments.
 289
 290 Instead of implementing this, it would actually probably be easier to just
 291 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 292 including having this work sanely.
 293
 294 ===-------------------------------------------------------------------------===
 295
 296 Fix Darwin FP-In-Integer Registers ABI
 297
 298 Darwin passes doubles in structures in integer registers, which is very very
 299 bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
 300 that percolates these things out of functions.
 301
 302 Check out how horrible this is:
 303 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 304
 305 This is an extension of "interprocedural CC unmunging" that can't be done with
 306 just fastcc.
 307
 308 ===-------------------------------------------------------------------------===
 309
 310 Generate lwbrx and other byteswapping load/store instructions when reasonable.
 311
 312 ===-------------------------------------------------------------------------===
 313
 314 Compile this:
 315
 316 int foo(int a) {
 317   int b = (a < 8);
 318   if (b) {
 319     return b * 3;     // ignore the fact that this is always 3.
 320   } else {
 321     return 2;
 322   }
 323 }
 324
 325 into something not this:
 326
 327 _foo:
 328 1)      cmpwi cr7, r3, 8
 329         mfcr r2, 1
 330         rlwinm r2, r2, 29, 31, 31
 331 1)      cmpwi cr0, r3, 7
 332         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 333 LBB1_1: ; then
 334         rlwinm r2, r2, 0, 31, 31
 335         mulli r3, r2, 3
 336         blr
 337 LBB1_2: ; UnifiedReturnBlock
 338         li r3, 2
 339         blr
 340
 341 In particular, the two compares (marked 1) could be shared by reversing one.
 342 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 343 same operands (but backwards) exists.  In this case, this wouldn't save us
 344 anything though, because the compares still wouldn't be shared.
 345
 346 ===-------------------------------------------------------------------------===
 347
 348 The legalizer should lower this:
 349
 350 bool %test(ulong %x) {
 351   %tmp = setlt ulong %x, 4294967296
 352   ret bool %tmp
 353 }
 354
 355 into "if x.high == 0", not:
 356
 357 _test:
 358         addi r2, r3, -1
 359         cntlzw r2, r2
 360         cntlzw r3, r3
 361         srwi r2, r2, 5
 362         srwi r4, r3, 5
 363         li r3, 0
 364         cmpwi cr0, r2, 0
 365         bne cr0, LBB1_2 ;
 366 LBB1_1:
 367         or r3, r4, r4
 368 LBB1_2:
 369         blr
 370
 371 noticed in 2005-05-11-Popcount-ffs-fls.c.
 372
 373
 374 ===-------------------------------------------------------------------------===
 375
 376 We should custom expand setcc instead of pretending that we have it.  That
 377 would allow us to expose the access of the crbit after the mfcr, allowing
 378 that access to be trivially folded into other ops.  A simple example:
 379
 380 int foo(int a, int b) { return (a < b) << 4; }
 381
 382 compiles into:
 383
 384 _foo:
 385         cmpw cr7, r3, r4
 386         mfcr r2, 1
 387         rlwinm r2, r2, 29, 31, 31
 388         slwi r3, r2, 4
 389         blr
 390
 391 ===-------------------------------------------------------------------------===
 392
 393 Fold add and sub with constant into non-extern, non-weak addresses so this:
 394
 395 static int a;
 396 void bar(int b) { a = b; }
 397 void foo(unsigned char *c) {
 398   *c = a;
 399 }
 400
 401 So that
 402
 403 _foo:
 404         lis r2, ha16(_a)
 405         la r2, lo16(_a)(r2)
 406         lbz r2, 3(r2)
 407         stb r2, 0(r3)
 408         blr
 409
 410 Becomes
 411
 412 _foo:
 413         lis r2, ha16(_a+3)
 414         lbz r2, lo16(_a+3)(r2)
 415         stb r2, 0(r3)
 416         blr
 417
 418 ===-------------------------------------------------------------------------===
 419
 420 We generate really bad code for this:
 421
 422 int f(signed char *a, _Bool b, _Bool c) {
 423    signed char t = 0;
 424   if (b)  t = *a;
 425   if (c)  *a = t;
 426 }
 427
 428 ===-------------------------------------------------------------------------===
 429
 430 This:
 431 int test(unsigned *P) { return *P >> 24; }
 432
 433 Should compile to:
 434
 435 _test:
 436         lbz r3,0(r3)
 437         blr
 438
 439 not:
 440
 441 _test:
 442         lwz r2, 0(r3)
 443         srwi r3, r2, 24
 444         blr
 445
 446 ===-------------------------------------------------------------------------===
 447
 448 On the G5, logical CR operations are more expensive in their three
 449 address form: ops that read/write the same register are half as expensive as
 450 those that read from two registers that are different from their destination.
 451
 452 We should model this with two separate instructions.  The isel should generate
 453 the "two address" form of the instructions.  When the register allocator
 454 detects that it needs to insert a copy due to the two-addresness of the CR
 455 logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
 456 we can convert to the "three address" instruction, to save code space.
 457
 458 This only matters when we start generating cr logical ops.
 459
 460 ===-------------------------------------------------------------------------===
 461
 462 We should compile these two functions to the same thing:
 463
 464 #include <stdlib.h>
 465 void f(int a, int b, int *P) {
 466   *P = (a-b)>=0?(a-b):(b-a);
 467 }
 468 void g(int a, int b, int *P) {
 469   *P = abs(a-b);
 470 }
 471
 472 Further, they should compile to something better than:
 473
 474 _g:
 475         subf r2, r4, r3
 476         subfic r3, r2, 0
 477         cmpwi cr0, r2, -1
 478         bgt cr0, LBB2_2 ; entry
 479 LBB2_1: ; entry
 480         mr r2, r3
 481 LBB2_2: ; entry
 482         stw r2, 0(r5)
 483         blr
 484
 485 GCC produces:
 486
 487 _g:
 488         subf r4,r4,r3
 489         srawi r2,r4,31
 490         xor r0,r2,r4
 491         subf r0,r2,r0
 492         stw r0,0(r5)
 493         blr
 494
 495 ... which is much nicer.
 496
 497 This theoretically may help improve twolf slightly (used in dimbox.c:142?).
 498
 499 ===-------------------------------------------------------------------------===
 500
 501 int foo(int N, int ***W, int **TK, int X) {
 502   int t, i;
 503
 504   for (t = 0; t < N; ++t)
 505     for (i = 0; i < 4; ++i)
 506       W[t / X][i][t % X] = TK[i][t];
 507
 508   return 5;
 509 }
 510
 511 We generate relatively atrocious code for this loop compared to gcc.
 512
 513 We could also strength reduce the rem and the div:
 514 http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
 515
 516 ===-------------------------------------------------------------------------===
 517
 518 float foo(float X) { return (int)(X); }
 519
 520 Currently produces:
 521
 522 _foo:
 523         fctiwz f0, f1
 524         stfd f0, -8(r1)
 525         lwz r2, -4(r1)
 526         extsw r2, r2
 527         std r2, -16(r1)
 528         lfd f0, -16(r1)
 529         fcfid f0, f0
 530         frsp f1, f0
 531         blr
 532
 533 We could use a target dag combine to turn the lwz/extsw into an lwa when the
 534 lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
 535 win only.
 536
 537 ===-------------------------------------------------------------------------===
 538
 539 We generate ugly code for this:
 540
 541 void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
 542   unsigned code = 0;
 543   if(dx < -dw) code |= 1;
 544   if(dx > dw)  code |= 2;
 545   if(dy < -dw) code |= 4;
 546   if(dy > dw)  code |= 8;
 547   if(dz < -dw) code |= 16;
 548   if(dz > dw)  code |= 32;
 549   *ret = code;
 550 }
 551
 552 ===-------------------------------------------------------------------------===
 553
 554 Complete the signed i32 to FP conversion code using 64-bit registers
 555 transformation, good for PI.  See PPCISelLowering.cpp, this comment:
 556
 557      // FIXME: disable this lowered code.  This generates 64-bit register values,
 558      // and we don't model the fact that the top part is clobbered by calls.  We
 559      // need to flag these together so that the value isn't live across a call.
 560      //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 561