lib/Target/PowerPC/README.txt

   1 TODO:
   2 * gpr0 allocation
   3 * implement do-loop -> bdnz transform
   4 * implement powerpc-64 for darwin
   5
   6 ===-------------------------------------------------------------------------===
   7
   8 Use the stfiwx instruction for:
   9
  10 void foo(float a, int *b) { *b = a; }
  11
  12 ===-------------------------------------------------------------------------===
  13
  14 unsigned short foo(float a) { return a; }
  15 should be:
  16 _foo:
  17         fctiwz f0,f1
  18         stfd f0,-8(r1)
  19         lhz r3,-2(r1)
  20         blr
  21 not:
  22 _foo:
  23         fctiwz f0, f1
  24         stfd f0, -8(r1)
  25         lwz r2, -4(r1)
  26         rlwinm r3, r2, 0, 16, 31
  27         blr
  28
  29 ===-------------------------------------------------------------------------===
  30
  31 Support 'update' load/store instructions.  These are cracked on the G5, but are
  32 still a codesize win.
  33
  34 ===-------------------------------------------------------------------------===
  35
  36 Should hint to the branch select pass that it doesn't need to print the second
  37 unconditional branch, so we don't end up with things like:
  38         b .LBBl42__2E_expand_function_8_674     ; loopentry.24
  39         b .LBBl42__2E_expand_function_8_42      ; NewDefault
  40         b .LBBl42__2E_expand_function_8_42      ; NewDefault
  41
  42 This occurs in SPASS.
  43
  44 ===-------------------------------------------------------------------------===
  45
  46 * Codegen this:
  47
  48    void test2(int X) {
  49      if (X == 0x12345678) bar();
  50    }
  51
  52     as:
  53
  54        xoris r0,r3,0x1234
  55        cmpwi cr0,r0,0x5678
  56        beq cr0,L6
  57
  58     not:
  59
  60         lis r2, 4660
  61         ori r2, r2, 22136
  62         cmpw cr0, r3, r2
  63         bne .LBB_test2_2
  64
  65 ===-------------------------------------------------------------------------===
  66
  67 Lump the constant pool for each function into ONE pic object, and reference
  68 pieces of it as offsets from the start.  For functions like this (contrived
  69 to have lots of constants obviously):
  70
  71 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
  72
  73 We generate:
  74
  75 _X:
  76         lis r2, ha16(.CPI_X_0)
  77         lfd f0, lo16(.CPI_X_0)(r2)
  78         lis r2, ha16(.CPI_X_1)
  79         lfd f2, lo16(.CPI_X_1)(r2)
  80         fmadd f0, f1, f0, f2
  81         lis r2, ha16(.CPI_X_2)
  82         lfd f1, lo16(.CPI_X_2)(r2)
  83         lis r2, ha16(.CPI_X_3)
  84         lfd f2, lo16(.CPI_X_3)(r2)
  85         fmadd f1, f0, f1, f2
  86         blr
  87
  88 It would be better to materialize .CPI_X into a register, then use immediates
  89 off of the register to avoid the lis's.  This is even more important in PIC
  90 mode.
  91
  92 Note that this (and the static variable version) is discussed here for GCC:
  93 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  94
  95 ===-------------------------------------------------------------------------===
  96
  97 PIC Code Gen IPO optimization:
  98
  99 Squish small scalar globals together into a single global struct, allowing the
 100 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
 101 of the GOT on targets with one).
 102
 103 Note that this is discussed here for GCC:
 104 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 105
 106 ===-------------------------------------------------------------------------===
 107
 108 Implement Newton-Raphson method for improving estimate instructions to the
 109 correct accuracy, and implementing divide as multiply by reciprocal when it has
 110 more than one use.  Itanium will want this too.
 111
 112 ===-------------------------------------------------------------------------===
 113
 114 #define  ARRAY_LENGTH  16
 115
 116 union bitfield {
 117         struct {
 118 #ifndef __ppc__
 119                 unsigned int                       field0 : 6;
 120                 unsigned int                       field1 : 6;
 121                 unsigned int                       field2 : 6;
 122                 unsigned int                       field3 : 6;
 123                 unsigned int                       field4 : 3;
 124                 unsigned int                       field5 : 4;
 125                 unsigned int                       field6 : 1;
 126 #else
 127                 unsigned int                       field6 : 1;
 128                 unsigned int                       field5 : 4;
 129                 unsigned int                       field4 : 3;
 130                 unsigned int                       field3 : 6;
 131                 unsigned int                       field2 : 6;
 132                 unsigned int                       field1 : 6;
 133                 unsigned int                       field0 : 6;
 134 #endif
 135         } bitfields, bits;
 136         unsigned int    u32All;
 137         signed int      i32All;
 138         float   f32All;
 139 };
 140
 141
 142 typedef struct program_t {
 143         union bitfield    array[ARRAY_LENGTH];
 144     int               size;
 145     int               loaded;
 146 } program;
 147
 148
 149 void AdjustBitfields(program* prog, unsigned int fmt1)
 150 {
 151         unsigned int shift = 0;
 152         unsigned int texCount = 0;
 153         unsigned int i;
 154
 155         for (i = 0; i < 8; i++)
 156         {
 157                 prog->array[i].bitfields.field0 = texCount;
 158                 prog->array[i].bitfields.field1 = texCount + 1;
 159                 prog->array[i].bitfields.field2 = texCount + 2;
 160                 prog->array[i].bitfields.field3 = texCount + 3;
 161
 162                 texCount += (fmt1 >> shift) & 0x7;
 163                 shift    += 3;
 164         }
 165 }
 166
 167 In the loop above, the bitfield adds get generated as
 168 (add (shl bitfield, C1), (shl C2, C1)) where C2 is 1, 2 or 3.
 169
 170 Since the input to the (or (and), (and)) pattern is an (add) rather than a (shl), the shift
 171 doesn't get folded into the rlwimi instruction.  We should ideally see through
 172 things like this, rather than forcing llvm to generate the equivalent
 173
 174 (shl (add bitfield, C2), C1) with some kind of mask.
 175
 176 ===-------------------------------------------------------------------------===
 177
 178 Compile this:
 179
 180 int %f1(int %a, int %b) {
 181         %tmp.1 = and int %a, 15         ; <int> [#uses=1]
 182         %tmp.3 = and int %b, 240                ; <int> [#uses=1]
 183         %tmp.4 = or int %tmp.3, %tmp.1          ; <int> [#uses=1]
 184         ret int %tmp.4
 185 }
 186
 187 without a copy.  We make this currently:
 188
 189 _f1:
 190         rlwinm r2, r4, 0, 24, 27
 191         rlwimi r2, r3, 0, 28, 31
 192         or r3, r2, r2
 193         blr
 194
 195 The two-addr pass or RA needs to learn when it is profitable to commute an
 196 instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
 197 currently only commutes to avoid inserting a copy BEFORE the two addr instr.
 198
 199 ===-------------------------------------------------------------------------===
 200
 201 Compile offsets from allocas:
 202
 203 int *%test() {
 204         %X = alloca { int, int }
 205         %Y = getelementptr {int,int}* %X, int 0, uint 1
 206         ret int* %Y
 207 }
 208
 209 into a single add, not two:
 210
 211 _test:
 212         addi r2, r1, -8
 213         addi r3, r2, 4
 214         blr
 215
 216 --> important for C++.
 217
 218 ===-------------------------------------------------------------------------===
 219
 220 int test3(int a, int b) { return (a < 0) ? a : 0; }
 221
 222 should be branch free code.  LLVM is turning it into < 1 because of the RHS.
 223
 224 ===-------------------------------------------------------------------------===
 225
 226 No loads or stores of the constants should be needed:
 227
 228 struct foo { double X, Y; };
 229 void xxx(struct foo F);
 230 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 231
 232 ===-------------------------------------------------------------------------===
 233
 234 Darwin Stub LICM optimization:
 235
 236 Loops like this:
 237
 238   for (...)  bar();
 239
 240 Have to go through an indirect stub if bar is external or linkonce.  It would
 241 be better to compile it as:
 242
 243      fp = &bar;
 244      for (...)  fp();
 245
 246 which only computes the address of bar once (instead of each time through the
 247 stub).  This is Darwin specific and would have to be done in the code generator.
 248 Probably not a win on x86.
 249
 250 ===-------------------------------------------------------------------------===
 251
 252 PowerPC i1/setcc stuff (depends on subreg stuff):
 253
 254 Check out the PPC code we get for 'compare' in this testcase:
 255 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672
 256
 257 Oof.  On top of not doing the logical crnand instead of (mfcr, mfcr,
 258 invert, invert, or), we then have to compare it against zero instead of
 259 using the value already in a CR!
 260
 261 That should be something like
 262         cmpw cr7, r8, r5
 263         cmpw cr0, r7, r3
 264         crnand cr0, cr0, cr7
 265         bne cr0, LBB_compare_4
 266
 267 instead of
 268         cmpw cr7, r8, r5
 269         cmpw cr0, r7, r3
 270         mfcr r7, 1
 271         mcrf cr7, cr0
 272         mfcr r8, 1
 273         rlwinm r7, r7, 30, 31, 31
 274         rlwinm r8, r8, 30, 31, 31
 275         xori r7, r7, 1
 276         xori r8, r8, 1
 277         addi r2, r2, 1
 278         or r7, r8, r7
 279         cmpwi cr0, r7, 0
 280         bne cr0, LBB_compare_4  ; loopexit
 281
 282 FreeBench/mason has a basic block that looks like this:
 283
 284          %tmp.130 = seteq int %p.0__, 5          ; <bool> [#uses=1]
 285          %tmp.134 = seteq int %p.1__, 6          ; <bool> [#uses=1]
 286          %tmp.139 = seteq int %p.2__, 12         ; <bool> [#uses=1]
 287          %tmp.144 = seteq int %p.3__, 13         ; <bool> [#uses=1]
 288          %tmp.149 = seteq int %p.4__, 14         ; <bool> [#uses=1]
 289          %tmp.154 = seteq int %p.5__, 15         ; <bool> [#uses=1]
 290          %bothcond = and bool %tmp.134, %tmp.130         ; <bool> [#uses=1]
 291          %bothcond123 = and bool %bothcond, %tmp.139             ; <bool>
 292          %bothcond124 = and bool %bothcond123, %tmp.144          ; <bool>
 293          %bothcond125 = and bool %bothcond124, %tmp.149          ; <bool>
 294          %bothcond126 = and bool %bothcond125, %tmp.154          ; <bool>
 295          br bool %bothcond126, label %shortcirc_next.5, label %else.0
 296
 297 This is a particularly important case where handling CRs better will help.
 298
 299 ===-------------------------------------------------------------------------===
 300
 301 Simple IPO for argument passing, change:
 302   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 303
 304 The Darwin ABI specifies that any integer arguments in the first 32 bytes' worth
 305 of arguments get assigned to r3 through r10. That is, if you have a function
 306 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 307 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 308 order for functions we can internalize so that the maximum number of
 309 integers/pointers get passed in regs before you see any of the fp arguments.
 310
 311 Instead of implementing this, it would actually probably be easier to just
 312 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 313 including having this work sanely.
 314
 315 ===-------------------------------------------------------------------------===
 316
 317 Fix Darwin FP-In-Integer Registers ABI
 318
 319 Darwin passes doubles in structures in integer registers, which is very very
 320 bad.  Add something like a BIT_CONVERT to LLVM, then do an interprocedural transformation
 321 that percolates these things out of functions.
 322
 323 Check out how horrible this is:
 324 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 325
 326 This is an extension of "interprocedural CC unmunging" that can't be done with
 327 just fastcc.
 328
 329 ===-------------------------------------------------------------------------===
 330
 331 Generate lwbrx and other byteswapping load/store instructions when reasonable.
 332
 333 ===-------------------------------------------------------------------------===
 334
 335 Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
 336 TargetConstantVec's if it's one of the many forms that are algorithmically
 337 computable using the spiffy altivec instructions.
 338
 339 ===-------------------------------------------------------------------------===
 340
 341 Compile this:
 342
 343 double %test(double %X) {
 344         %Y = cast double %X to long
 345         %Z = cast long %Y to double
 346         ret double %Z
 347 }
 348
 349 to this:
 350
 351 _test:
 352         fctidz f0, f1
 353         stfd f0, -8(r1)
 354         lwz r2, -4(r1)
 355         lwz r3, -8(r1)
 356         stw r2, -12(r1)
 357         stw r3, -16(r1)
 358         lfd f0, -16(r1)
 359         fcfid f1, f0
 360         blr
 361
 362 without the lwz/stw's.
 363
 364 ===-------------------------------------------------------------------------===
 365
 366 Compile this:
 367
 368 int foo(int a) {
 369   int b = (a < 8);
 370   if (b) {
 371     return b * 3;     // ignore the fact that this is always 3.
 372   } else {
 373     return 2;
 374   }
 375 }
 376
 377 into something other than this:
 378
 379 _foo:
 380 1)      cmpwi cr7, r3, 8
 381         mfcr r2, 1
 382         rlwinm r2, r2, 29, 31, 31
 383 1)      cmpwi cr0, r3, 7
 384         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 385 LBB1_1: ; then
 386         rlwinm r2, r2, 0, 31, 31
 387         mulli r3, r2, 3
 388         blr
 389 LBB1_2: ; UnifiedReturnBlock
 390         li r3, 2
 391         blr
 392
 393 In particular, the two compares (marked 1) could be shared by reversing one.
 394 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 395 same operands (but backwards) exists.  In this case, this wouldn't save us
 396 anything though, because the compares still wouldn't be shared.
 397
 398 ===-------------------------------------------------------------------------===
 399
 400 The legalizer should lower this:
 401
 402 bool %test(ulong %x) {
 403   %tmp = setlt ulong %x, 4294967296
 404   ret bool %tmp
 405 }
 406
 407 into "if x.high == 0", not:
 408
 409 _test:
 410         addi r2, r3, -1
 411         cntlzw r2, r2
 412         cntlzw r3, r3
 413         srwi r2, r2, 5
 414         srwi r4, r3, 5
 415         li r3, 0
 416         cmpwi cr0, r2, 0
 417         bne cr0, LBB1_2 ;
 418 LBB1_1:
 419         or r3, r4, r4
 420 LBB1_2:
 421         blr
 422
 423 Noticed in 2005-05-11-Popcount-ffs-fls.c.
 424
 425
 426 ===-------------------------------------------------------------------------===
 427
 428 We should custom expand setcc instead of pretending that we have it.  That
 429 would allow us to expose the access of the crbit after the mfcr, allowing
 430 that access to be trivially folded into other ops.  A simple example:
 431
 432 int foo(int a, int b) { return (a < b) << 4; }
 433
 434 compiles into:
 435
 436 _foo:
 437         cmpw cr7, r3, r4
 438         mfcr r2, 1
 439         rlwinm r2, r2, 29, 31, 31
 440         slwi r3, r2, 4
 441         blr
 442
 443 ===-------------------------------------------------------------------------===
 444
 445 Fold add and sub with constant into non-extern, non-weak addresses so this:
 446
 447 static int a;
 448 void bar(int b) { a = b; }
 449 void foo(unsigned char *c) {
 450   *c = a;
 451 }
 452
 453 So that
 454
 455 _foo:
 456         lis r2, ha16(_a)
 457         la r2, lo16(_a)(r2)
 458         lbz r2, 3(r2)
 459         stb r2, 0(r3)
 460         blr
 461
 462 Becomes
 463
 464 _foo:
 465         lis r2, ha16(_a+3)
 466         lbz r2, lo16(_a+3)(r2)
 467         stb r2, 0(r3)
 468         blr
 469
 470 ===-------------------------------------------------------------------------===
 471
 472 We generate really bad code for this:
 473
 474 int f(signed char *a, _Bool b, _Bool c) {
 475    signed char t = 0;
 476   if (b)  t = *a;
 477   if (c)  *a = t;
 478 }
 479