lib/Target/PowerPC/README.txt

   1 TODO:
   2 * gpr0 allocation
   3 * implement do-loop -> bdnz transform
   4 * implement powerpc-64 for darwin
   5
   6 ===-------------------------------------------------------------------------===
   7
   8 Use the stfiwx instruction for:
   9
  10 void foo(float a, int *b) { *b = a; }
  11
  12 ===-------------------------------------------------------------------------===
  13
  14 unsigned short foo(float a) { return a; }
  15 should be:
  16 _foo:
  17         fctiwz f0,f1
  18         stfd f0,-8(r1)
  19         lhz r3,-2(r1)
  20         blr
  21 not:
  22 _foo:
  23         fctiwz f0, f1
  24         stfd f0, -8(r1)
  25         lwz r2, -4(r1)
  26         rlwinm r3, r2, 0, 16, 31
  27         blr
  28
  29 ===-------------------------------------------------------------------------===
  30
  31 Support 'update' load/store instructions.  These are cracked on the G5, but are
  32 still a codesize win.
  33
  34 ===-------------------------------------------------------------------------===
  35
  36 Should hint to the branch select pass that it doesn't need to print the second
  37 unconditional branch, so we don't end up with things like:
  38         b .LBBl42__2E_expand_function_8_674     ; loopentry.24
  39         b .LBBl42__2E_expand_function_8_42      ; NewDefault
  40         b .LBBl42__2E_expand_function_8_42      ; NewDefault
  41
  42 This occurs in SPASS.
  43
  44 ===-------------------------------------------------------------------------===
  45
  46 * Codegen this:
  47
  48    void test2(int X) {
  49      if (X == 0x12345678) bar();
  50    }
  51
  52     as:
  53
  54        xoris r0,r3,0x1234
  55        cmplwi cr0,r0,0x5678
  56        beq cr0,L6
  57
  58     not:
  59
  60         lis r2, 4660
  61         ori r2, r2, 22136
  62         cmpw cr0, r3, r2
  63         bne .LBB_test2_2
  64
  65 ===-------------------------------------------------------------------------===
  66
  67 Lump the constant pool for each function into ONE pic object, and reference
  68 pieces of it as offsets from the start.  For functions like this (contrived
  69 to have lots of constants obviously):
  70
  71 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
  72
  73 We generate:
  74
  75 _X:
  76         lis r2, ha16(.CPI_X_0)
  77         lfd f0, lo16(.CPI_X_0)(r2)
  78         lis r2, ha16(.CPI_X_1)
  79         lfd f2, lo16(.CPI_X_1)(r2)
  80         fmadd f0, f1, f0, f2
  81         lis r2, ha16(.CPI_X_2)
  82         lfd f1, lo16(.CPI_X_2)(r2)
  83         lis r2, ha16(.CPI_X_3)
  84         lfd f2, lo16(.CPI_X_3)(r2)
  85         fmadd f1, f0, f1, f2
  86         blr
  87
  88 It would be better to materialize .CPI_X into a register, then use immediates
  89 off of the register to avoid the lis's.  This is even more important in PIC
  90 mode.
  91
  92 Note that this (and the static variable version) is discussed here for GCC:
  93 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  94
  95 ===-------------------------------------------------------------------------===
  96
  97 PIC Code Gen IPO optimization:
  98
  99 Squish small scalar globals together into a single global struct, allowing the
 100 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
 101 of the GOT on targets with one).
 102
 103 Note that this is discussed here for GCC:
 104 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 105
 106 ===-------------------------------------------------------------------------===
 107
  108 Implement the Newton-Raphson method for improving estimate instructions to the
  109 correct accuracy, and implement divide as multiply by reciprocal when it has
  110 more than one use.  Itanium will want this too.
 111
 112 ===-------------------------------------------------------------------------===
 113
 114 #define  ARRAY_LENGTH  16
 115
 116 union bitfield {
 117         struct {
 118 #ifndef __ppc__
 119                 unsigned int                       field0 : 6;
 120                 unsigned int                       field1 : 6;
 121                 unsigned int                       field2 : 6;
 122                 unsigned int                       field3 : 6;
 123                 unsigned int                       field4 : 3;
 124                 unsigned int                       field5 : 4;
 125                 unsigned int                       field6 : 1;
 126 #else
 127                 unsigned int                       field6 : 1;
 128                 unsigned int                       field5 : 4;
 129                 unsigned int                       field4 : 3;
 130                 unsigned int                       field3 : 6;
 131                 unsigned int                       field2 : 6;
 132                 unsigned int                       field1 : 6;
 133                 unsigned int                       field0 : 6;
 134 #endif
 135         } bitfields, bits;
 136         unsigned int    u32All;
 137         signed int      i32All;
 138         float   f32All;
 139 };
 140
 141
 142 typedef struct program_t {
 143         union bitfield    array[ARRAY_LENGTH];
 144     int               size;
 145     int               loaded;
 146 } program;
 147
 148
 149 void AdjustBitfields(program* prog, unsigned int fmt1)
 150 {
 151         prog->array[0].bitfields.field0 = fmt1;
 152         prog->array[0].bitfields.field1 = fmt1 + 1;
 153 }
 154
 155 We currently generate:
 156
 157 _AdjustBitfields:
 158         lwz r2, 0(r3)
 159         addi r5, r4, 1
 160         rlwinm r2, r2, 0, 0, 19
 161         rlwinm r5, r5, 6, 20, 25
 162         rlwimi r2, r4, 0, 26, 31
 163         or r2, r2, r5
 164         stw r2, 0(r3)
 165         blr
 166
 167 We should teach someone that or (rlwimi, rlwinm) with disjoint masks can be
 168 turned into rlwimi (rlwimi)
 169
 170 The better codegen would be:
 171
 172 _AdjustBitfields:
 173         lwz r0,0(r3)
 174         rlwinm r4,r4,0,0xff
 175         rlwimi r0,r4,0,26,31
 176         addi r4,r4,1
 177         rlwimi r0,r4,6,20,25
 178         stw r0,0(r3)
 179         blr
 180
 181 ===-------------------------------------------------------------------------===
 182
 183 Compile this:
 184
 185 int %f1(int %a, int %b) {
 186         %tmp.1 = and int %a, 15         ; <int> [#uses=1]
 187         %tmp.3 = and int %b, 240                ; <int> [#uses=1]
 188         %tmp.4 = or int %tmp.3, %tmp.1          ; <int> [#uses=1]
 189         ret int %tmp.4
 190 }
 191
 192 without a copy.  We make this currently:
 193
 194 _f1:
 195         rlwinm r2, r4, 0, 24, 27
 196         rlwimi r2, r3, 0, 28, 31
 197         or r3, r2, r2
 198         blr
 199
 200 The two-addr pass or RA needs to learn when it is profitable to commute an
 201 instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
 202 currently only commutes to avoid inserting a copy BEFORE the two addr instr.
 203
 204 ===-------------------------------------------------------------------------===
 205
 206 Compile offsets from allocas:
 207
 208 int *%test() {
 209         %X = alloca { int, int }
 210         %Y = getelementptr {int,int}* %X, int 0, uint 1
 211         ret int* %Y
 212 }
 213
 214 into a single add, not two:
 215
 216 _test:
 217         addi r2, r1, -8
 218         addi r3, r2, 4
 219         blr
 220
 221 --> important for C++.
 222
 223 ===-------------------------------------------------------------------------===
 224
 225 int test3(int a, int b) { return (a < 0) ? a : 0; }
 226
  227 should be branch-free code.  LLVM is turning it into < 1 because of the RHS.
 228
 229 ===-------------------------------------------------------------------------===
 230
 231 No loads or stores of the constants should be needed:
 232
 233 struct foo { double X, Y; };
 234 void xxx(struct foo F);
 235 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 236
 237 ===-------------------------------------------------------------------------===
 238
 239 Darwin Stub LICM optimization:
 240
 241 Loops like this:
 242
 243   for (...)  bar();
 244
  245 have to go through an indirect stub if bar is external or linkonce.  It would
 246 be better to compile it as:
 247
 248      fp = &bar;
 249      for (...)  fp();
 250
 251 which only computes the address of bar once (instead of each time through the
 252 stub).  This is Darwin specific and would have to be done in the code generator.
 253 Probably not a win on x86.
 254
 255 ===-------------------------------------------------------------------------===
 256
 257 PowerPC i1/setcc stuff (depends on subreg stuff):
 258
 259 Check out the PPC code we get for 'compare' in this testcase:
 260 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672
 261
  262 Oof.  On top of not doing the logical crnand instead of (mfcr, mfcr,
  263 invert, invert, or), we then have to compare it against zero instead of
  264 using the value already in a CR!
 265
  266 That should be something like:
 267         cmpw cr7, r8, r5
 268         cmpw cr0, r7, r3
 269         crnand cr0, cr0, cr7
 270         bne cr0, LBB_compare_4
 271
 272 instead of
 273         cmpw cr7, r8, r5
 274         cmpw cr0, r7, r3
 275         mfcr r7, 1
 276         mcrf cr7, cr0
 277         mfcr r8, 1
 278         rlwinm r7, r7, 30, 31, 31
 279         rlwinm r8, r8, 30, 31, 31
 280         xori r7, r7, 1
 281         xori r8, r8, 1
 282         addi r2, r2, 1
 283         or r7, r8, r7
 284         cmpwi cr0, r7, 0
 285         bne cr0, LBB_compare_4  ; loopexit
 286
 287 FreeBench/mason has a basic block that looks like this:
 288
 289          %tmp.130 = seteq int %p.0__, 5          ; <bool> [#uses=1]
 290          %tmp.134 = seteq int %p.1__, 6          ; <bool> [#uses=1]
 291          %tmp.139 = seteq int %p.2__, 12         ; <bool> [#uses=1]
 292          %tmp.144 = seteq int %p.3__, 13         ; <bool> [#uses=1]
 293          %tmp.149 = seteq int %p.4__, 14         ; <bool> [#uses=1]
 294          %tmp.154 = seteq int %p.5__, 15         ; <bool> [#uses=1]
 295          %bothcond = and bool %tmp.134, %tmp.130         ; <bool> [#uses=1]
 296          %bothcond123 = and bool %bothcond, %tmp.139             ; <bool>
 297          %bothcond124 = and bool %bothcond123, %tmp.144          ; <bool>
 298          %bothcond125 = and bool %bothcond124, %tmp.149          ; <bool>
 299          %bothcond126 = and bool %bothcond125, %tmp.154          ; <bool>
 300          br bool %bothcond126, label %shortcirc_next.5, label %else.0
 301
 302 This is a particularly important case where handling CRs better will help.
 303
 304 ===-------------------------------------------------------------------------===
 305
 306 Simple IPO for argument passing, change:
 307   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 308
  309 The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 310 of arguments get assigned to r3 through r10. That is, if you have a function
 311 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 312 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 313 order for functions we can internalize so that the maximum number of
 314 integers/pointers get passed in regs before you see any of the fp arguments.
 315
 316 Instead of implementing this, it would actually probably be easier to just
 317 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 318 including having this work sanely.
 319
 320 ===-------------------------------------------------------------------------===
 321
 322 Fix Darwin FP-In-Integer Registers ABI
 323
 324 Darwin passes doubles in structures in integer registers, which is very very
  325 bad.  Add something like a BIT_CONVERT to LLVM, then do an interprocedural
  326 transformation that percolates these things out of functions.
 327
 328 Check out how horrible this is:
 329 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 330
 331 This is an extension of "interprocedural CC unmunging" that can't be done with
 332 just fastcc.
 333
 334 ===-------------------------------------------------------------------------===
 335
 336 Generate lwbrx and other byteswapping load/store instructions when reasonable.
 337
 338 ===-------------------------------------------------------------------------===
 339
 340 Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
 341 TargetConstantVec's if it's one of the many forms that are algorithmically
 342 computable using the spiffy altivec instructions.
 343
 344 ===-------------------------------------------------------------------------===
 345
 346 Compile this:
 347
 348 double %test(double %X) {
 349         %Y = cast double %X to long
 350         %Z = cast long %Y to double
 351         ret double %Z
 352 }
 353
 354 to this:
 355
 356 _test:
 357         fctidz f0, f1
 358         stfd f0, -8(r1)
 359         lwz r2, -4(r1)
 360         lwz r3, -8(r1)
 361         stw r2, -12(r1)
 362         stw r3, -16(r1)
 363         lfd f0, -16(r1)
 364         fcfid f1, f0
 365         blr
 366
 367 without the lwz/stw's.
 368
 369 ===-------------------------------------------------------------------------===
 370
 371 Compile this:
 372
 373 int foo(int a) {
 374   int b = (a < 8);
 375   if (b) {
 376     return b * 3;     // ignore the fact that this is always 3.
 377   } else {
 378     return 2;
 379   }
 380 }
 381
  382 into something other than this:
 383
 384 _foo:
 385 1)      cmpwi cr7, r3, 8
 386         mfcr r2, 1
 387         rlwinm r2, r2, 29, 31, 31
 388 1)      cmpwi cr0, r3, 7
 389         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 390 LBB1_1: ; then
 391         rlwinm r2, r2, 0, 31, 31
 392         mulli r3, r2, 3
 393         blr
 394 LBB1_2: ; UnifiedReturnBlock
 395         li r3, 2
 396         blr
 397
 398 In particular, the two compares (marked 1) could be shared by reversing one.
 399 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 400 same operands (but backwards) exists.  In this case, this wouldn't save us
 401 anything though, because the compares still wouldn't be shared.
 402
 403 ===-------------------------------------------------------------------------===
 404
 405 The legalizer should lower this:
 406
 407 bool %test(ulong %x) {
 408   %tmp = setlt ulong %x, 4294967296
 409   ret bool %tmp
 410 }
 411
 412 into "if x.high == 0", not:
 413
 414 _test:
 415         addi r2, r3, -1
 416         cntlzw r2, r2
 417         cntlzw r3, r3
 418         srwi r2, r2, 5
 419         srwi r4, r3, 5
 420         li r3, 0
 421         cmpwi cr0, r2, 0
 422         bne cr0, LBB1_2 ;
 423 LBB1_1:
 424         or r3, r4, r4
 425 LBB1_2:
 426         blr
 427
  428 Noticed in 2005-05-11-Popcount-ffs-fls.c.
 429
 430
 431 ===-------------------------------------------------------------------------===
 432
 433 We should custom expand setcc instead of pretending that we have it.  That
 434 would allow us to expose the access of the crbit after the mfcr, allowing
 435 that access to be trivially folded into other ops.  A simple example:
 436
 437 int foo(int a, int b) { return (a < b) << 4; }
 438
 439 compiles into:
 440
 441 _foo:
 442         cmpw cr7, r3, r4
 443         mfcr r2, 1
 444         rlwinm r2, r2, 29, 31, 31
 445         slwi r3, r2, 4
 446         blr
 447
 448 ===-------------------------------------------------------------------------===
 449
 450 Fold add and sub with constant into non-extern, non-weak addresses so this:
 451
 452 static int a;
 453 void bar(int b) { a = b; }
 454 void foo(unsigned char *c) {
 455   *c = a;
 456 }
 457
  458 which currently compiles to:
 459
 460 _foo:
 461         lis r2, ha16(_a)
 462         la r2, lo16(_a)(r2)
 463         lbz r2, 3(r2)
 464         stb r2, 0(r3)
 465         blr
 466
 467 Becomes
 468
 469 _foo:
 470         lis r2, ha16(_a+3)
 471         lbz r2, lo16(_a+3)(r2)
 472         stb r2, 0(r3)
 473         blr
 474
 475 ===-------------------------------------------------------------------------===
 476
 477 We generate really bad code for this:
 478
 479 int f(signed char *a, _Bool b, _Bool c) {
 480    signed char t = 0;
 481   if (b)  t = *a;
 482   if (c)  *a = t;
 483 }
 484