//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//

* implement do-loop -> bdnz transform

===-------------------------------------------------------------------------===
We only produce the rlwnm instruction for rotate instructions. We should
at least match stuff like:

unsigned rot_and(unsigned X, int Y) {
  unsigned T = (X << Y) | (X >> (32 - Y));
  T &= 127;   // the rlwinm 0,25,31 below is this 'and' (mask of the low 7 bits)
  return T;
}

        rlwnm r2, r3, r4, 0, 31
        rlwinm r3, r2, 0, 25, 31

... which is the basic pattern that should be written in the instr. It may
also be useful for stuff like:

long long foo2(long long X, int C) {

which currently produces:

        rlwinm r2, r5, 0, 27, 25
===-------------------------------------------------------------------------===

Support 'update' load/store instructions. These are cracked on the G5, but are
still a codesize win.

===-------------------------------------------------------------------------===
Teach the .td file to pattern match PPC::BR_COND to the appropriate bc variant,
so we don't have to always run the branch selector for small functions.
===-------------------------------------------------------------------------===

Lump the constant pool for each function into ONE pic object, and reference
pieces of it as offsets from the start. For functions like this (contrived
to have lots of constants obviously):

double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }

        lis r2, ha16(.CPI_X_0)
        lfd f0, lo16(.CPI_X_0)(r2)
        lis r2, ha16(.CPI_X_1)
        lfd f2, lo16(.CPI_X_1)(r2)
        lis r2, ha16(.CPI_X_2)
        lfd f1, lo16(.CPI_X_2)(r2)
        lis r2, ha16(.CPI_X_3)
        lfd f2, lo16(.CPI_X_3)(r2)

It would be better to materialize .CPI_X into a register, then use immediates
off of the register to avoid the lis's. This is even more important in PIC
mode.

Note that this (and the static variable version) is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
===-------------------------------------------------------------------------===

PIC Code Gen IPO optimization:

Squish small scalar globals together into a single global struct, allowing the
address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
of the GOT on targets with one).

Note that this is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
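As a rough C-level illustration of the transformation (all names here are
hypothetical, not from any actual pass):

```c
/* Before (hypothetical): three separate globals, each access needing its
   own PIC/GOT address computation. */
int counter, limit, flags;

/* After: one aggregate, one base address; each field becomes a constant
   offset from that base, so the address computation can be CSE'd. */
struct Globals {
    int counter;
    int limit;
    int flags;
} G = {1, 2, 3};

int sum_globals(void) {
    /* All three loads share the single address of G. */
    return G.counter + G.limit + G.flags;
}
```

The win is that the address of G is computed once and reused, instead of one
PIC sequence per scalar global.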
===-------------------------------------------------------------------------===

Implement the Newton-Raphson method for refining estimate instructions to the
correct accuracy, and implement divide as multiply-by-reciprocal when the
divisor has more than one use. Itanium will want this too.
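For reference, the standard Newton-Raphson step for a reciprocal estimate
r ~= 1/d is r' = r * (2 - d*r); the relative error roughly squares each step,
so a low-precision hardware estimate (fres on PPC) needs only a couple of
steps for full float accuracy. A minimal C sketch, starting from a
caller-supplied estimate rather than the real estimate instruction:

```c
/* One Newton-Raphson refinement step for a reciprocal estimate r ~= 1/d:
   r' = r * (2 - d*r).  The relative error roughly squares per step. */
float refine_recip(float d, float r) {
    return r * (2.0f - d * r);
}

/* Divide as multiply-by-reciprocal: refine a crude estimate twice, then
   multiply.  A real lowering would start from the fres estimate. */
float div_via_recip(float n, float d, float estimate) {
    float r = refine_recip(d, refine_recip(d, estimate));
    return n * r;
}
```

With an estimate of 0.2 for 1/4, one step gives 0.24 and two steps 0.2496,
illustrating the quadratic convergence.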
===-------------------------------------------------------------------------===

Compile this:

int %f1(int %a, int %b) {
        %tmp.1 = and int %a, 15         ; <int> [#uses=1]
        %tmp.3 = and int %b, 240        ; <int> [#uses=1]
        %tmp.4 = or int %tmp.3, %tmp.1  ; <int> [#uses=1]
        ret int %tmp.4
}

without a copy. We make this currently:

        rlwinm r2, r4, 0, 24, 27
        rlwimi r2, r3, 0, 28, 31

The two-addr pass or RA needs to learn when it is profitable to commute an
instruction to avoid a copy AFTER the 2-addr instruction. The 2-addr pass
currently only commutes to avoid inserting a copy BEFORE the two addr instr.
===-------------------------------------------------------------------------===

Compile offsets from allocas:

%X = alloca { int, int }
%Y = getelementptr {int,int}* %X, int 0, uint 1

into a single add, not two:

--> important for C++.
===-------------------------------------------------------------------------===

int test3(int a, int b) { return (a < 0) ? a : 0; }

should be branch free code. LLVM is turning it into < 1 because of the RHS.
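One branch-free formulation, as a sketch: with a 32-bit int and an arithmetic
right shift (implementation-defined in C, but exactly what the backend itself
can emit as srawi), a >> 31 is all ones when a is negative and zero otherwise,
so it can mask the result directly:

```c
/* Branch-free (a < 0) ? a : 0, assuming 32-bit int with arithmetic
   (sign-propagating) right shift: a >> 31 is -1 when a < 0, else 0. */
int test3_branchfree(int a, int b) {
    (void)b;                 /* b is unused, as in the original */
    return a & (a >> 31);
}
```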
===-------------------------------------------------------------------------===

No loads or stores of the constants should be needed:

struct foo { double X, Y; };
void xxx(struct foo F);
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
===-------------------------------------------------------------------------===

Darwin Stub LICM optimization:

loops like this:

    for (...)  bar();

Have to go through an indirect stub if bar is external or linkonce. It would
be better to compile it as:

    fp = &bar_stub;
    for (...)  fp();

which only computes the address of bar once (instead of each time through the
stub). This is Darwin specific and would have to be done in the code generator.
Probably not a win on x86.
===-------------------------------------------------------------------------===

PowerPC i1/setcc stuff (depends on subreg stuff):

Check out the PPC code we get for 'compare' in this testcase:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672

oof. on top of not doing the logical crnand instead of (mfcr, mfcr,
invert, invert, or), we then have to compare it against zero instead of
using the value already in a CR!

that should be something like

        bne cr0, LBB_compare_4

        rlwinm r7, r7, 30, 31, 31
        rlwinm r8, r8, 30, 31, 31

        bne cr0, LBB_compare_4  ; loopexit
FreeBench/mason has a basic block that looks like this:

        %tmp.130 = seteq int %p.0__, 5          ; <bool> [#uses=1]
        %tmp.134 = seteq int %p.1__, 6          ; <bool> [#uses=1]
        %tmp.139 = seteq int %p.2__, 12         ; <bool> [#uses=1]
        %tmp.144 = seteq int %p.3__, 13         ; <bool> [#uses=1]
        %tmp.149 = seteq int %p.4__, 14         ; <bool> [#uses=1]
        %tmp.154 = seteq int %p.5__, 15         ; <bool> [#uses=1]
        %bothcond = and bool %tmp.134, %tmp.130         ; <bool> [#uses=1]
        %bothcond123 = and bool %bothcond, %tmp.139     ; <bool> [#uses=1]
        %bothcond124 = and bool %bothcond123, %tmp.144  ; <bool> [#uses=1]
        %bothcond125 = and bool %bothcond124, %tmp.149  ; <bool> [#uses=1]
        %bothcond126 = and bool %bothcond125, %tmp.154  ; <bool> [#uses=1]
        br bool %bothcond126, label %shortcirc_next.5, label %else.0

This is a particularly important case where handling CRs better will help.
===-------------------------------------------------------------------------===

Simple IPO for argument passing, change:
  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)

The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
of arguments get assigned to r3 through r10. That is, if you have a function
foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
argument bytes for r4 and r5. The trick then would be to shuffle the argument
order for functions we can internalize so that the maximum number of
integers/pointers get passed in regs before you see any of the fp arguments.

Instead of implementing this, it would actually probably be easier to just
implement a PPC fastcc, where we could do whatever we wanted to the CC,
including having this work sanely.
===-------------------------------------------------------------------------===

Fix Darwin FP-In-Integer Registers ABI

Darwin passes doubles in structures in integer registers, which is very very
bad. Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
that percolates these things out of functions.

Check out how horrible this is:
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html

This is an extension of "interprocedural CC unmunging" that can't be done
with just fastcc.

===-------------------------------------------------------------------------===
        return b * 3;   // ignore the fact that this is always 3.

into something not this:

        rlwinm r2, r2, 29, 31, 31

        bgt cr0, LBB1_2 ; UnifiedReturnBlock

        rlwinm r2, r2, 0, 31, 31

LBB1_2: ; UnifiedReturnBlock

In particular, the two compares (marked 1) could be shared by reversing one.
This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
same operands (but backwards) exists. In this case, this wouldn't save us
anything though, because the compares still wouldn't be shared.
===-------------------------------------------------------------------------===

The legalizer should lower this:

bool %test(ulong %x) {
        %tmp = setlt ulong %x, 4294967296
        ret bool %tmp
}

into "if x.high == 0", not:

noticed in 2005-05-11-Popcount-ffs-fls.c.
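In C terms, the desired lowering just tests the high word (a sketch, with a
64-bit unsigned long long standing in for ulong):

```c
/* setlt ulong %x, 4294967296 asks whether x < 2^32, which is exactly
   "the high 32 bits of x are zero". */
int high_word_is_zero(unsigned long long x) {
    return (unsigned int)(x >> 32) == 0;
}
```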
===-------------------------------------------------------------------------===

We should custom expand setcc instead of pretending that we have it. That
would allow us to expose the access of the crbit after the mfcr, allowing
that access to be trivially folded into other ops. A simple example:

int foo(int a, int b) { return (a < b) << 4; }

        rlwinm r2, r2, 29, 31, 31
===-------------------------------------------------------------------------===

Fold add and sub with constant into non-extern, non-weak addresses so this:

void bar(int b) { a = b; }
void foo(unsigned char *c) {

        lbz r2, lo16(_a+3)(r2)

===-------------------------------------------------------------------------===
359 We generate really bad code for this:
361 int f(signed char *a, _Bool b, _Bool c) {
367 ===-------------------------------------------------------------------------===
370 int test(unsigned *P) { return *P >> 24; }
385 ===-------------------------------------------------------------------------===
On the G5, logical CR operations are more expensive in their three-address
form: ops that read/write the same register are half as expensive as those
that read from two registers that are different from their destination.

We should model this with two separate instructions. The isel should generate
the "two address" form of the instructions. When the register allocator
detects that it needs to insert a copy due to the two-addressness of the CR
logical op, it will invoke PPCInstrInfo::convertToThreeAddress. At this point
we can convert to the "three address" instruction, to save code space.

This only matters when we start generating cr logical ops.
===-------------------------------------------------------------------------===

We should compile these two functions to the same thing:

void f(int a, int b, int *P) {
  *P = (a-b)>=0?(a-b):(b-a);
}
void g(int a, int b, int *P) {
  *P = abs(a-b);
}

Further, they should compile to something better than:

        bgt cr0, LBB2_2 ; entry

... which is much nicer.

This theoretically may help improve twolf slightly (used in dimbox.c:142?).
===-------------------------------------------------------------------------===

int foo(int N, int ***W, int **TK, int X) {
  int t, i;

  for (t = 0; t < N; ++t)
    for (i = 0; i < 4; ++i)
      W[t / X][i][t % X] = TK[i][t];

We generate relatively atrocious code for this loop compared to gcc.

We could also strength reduce the rem and the div:
http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
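Since t advances by one each iteration, the div and rem in the loop above can
be strength-reduced to two counters maintained incrementally. A hand-written
sketch of the transformation (with hypothetical output arrays standing in for
the W/TK accesses):

```c
/* Computes q_out[t] = t / X and r_out[t] = t % X for t in [0, N) without
   any divide or rem inside the loop: r counts up to X and carries into q. */
void div_rem_strength_reduced(int N, int X, int q_out[], int r_out[]) {
    int q = 0, r = 0;
    for (int t = 0; t < N; ++t) {
        q_out[t] = q;
        r_out[t] = r;
        if (++r == X) {   /* wrap the remainder, bump the quotient */
            r = 0;
            ++q;
        }
    }
}
```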
===-------------------------------------------------------------------------===

float foo(float X) { return (int)(X); }

We could use a target dag combine to turn the lwz/extsw into an lwa when the
lwz has a single use. Since LWA is cracked anyway, this would be a codesize
win only.
===-------------------------------------------------------------------------===

We generate ugly code for this:

void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
  unsigned int code = 0;
  if(dx < -dw) code |= 1;
  if(dx > dw)  code |= 2;
  if(dy < -dw) code |= 4;
  if(dy > dw)  code |= 8;
  if(dz < -dw) code |= 16;
  if(dz > dw)  code |= 32;
  *ret = code;
}
Complete the 'signed i32 to FP conversion using 64-bit registers'
transformation, good for PI. See PPCISelLowering.cpp, this comment:

// FIXME: disable this lowered code. This generates 64-bit register values,
// and we don't model the fact that the top part is clobbered by calls. We
// need to flag these together so that the value isn't live across a call.
//setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);

Also, if the registers are spilled to the stack, we have to ensure that all
64 bits of them are saved/restored, otherwise we will miscompile the code. It
sounds like we need to get the 64-bit register classes going.
===-------------------------------------------------------------------------===

%struct.B = type { ubyte, [3 x ubyte] }

void %foo(%struct.B* %b) {
        %tmp = cast %struct.B* %b to uint*      ; <uint*> [#uses=1]
        %tmp = load uint* %tmp                  ; <uint> [#uses=1]
        %tmp3 = cast %struct.B* %b to uint*     ; <uint*> [#uses=1]
        %tmp4 = load uint* %tmp3                ; <uint> [#uses=1]
        %tmp8 = cast %struct.B* %b to uint*     ; <uint*> [#uses=2]
        %tmp9 = load uint* %tmp8                ; <uint> [#uses=1]
        %tmp4.mask17 = shl uint %tmp4, ubyte 1  ; <uint> [#uses=1]
        %tmp1415 = and uint %tmp4.mask17, 2147483648    ; <uint> [#uses=1]
        %tmp.masked = and uint %tmp, 2147483648         ; <uint> [#uses=1]
        %tmp11 = or uint %tmp1415, %tmp.masked          ; <uint> [#uses=1]
        %tmp12 = and uint %tmp9, 2147483647             ; <uint> [#uses=1]
        %tmp13 = or uint %tmp12, %tmp11                 ; <uint> [#uses=1]
        store uint %tmp13, uint* %tmp8
        ret void
}

        rlwimi r2, r4, 0, 0, 0

We could collapse a bunch of those ORs and ANDs and generate the following:

        rlwinm r4, r2, 1, 0, 0
===-------------------------------------------------------------------------===

On PPC64, this results in a truncate followed by a truncstore. These should
be combined into a single (truncating) store:

void foo(unsigned long H) { G = H; }

===-------------------------------------------------------------------------===
unsigned test6(unsigned x) {
  return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
}

        rlwinm r3, r3, 16, 0, 31

        rlwinm r3, r3, 16, 24, 31