lib/Target/PowerPC/README.txt

   1 TODO:
   2 * gpr0 allocation
   3 * implement do-loop -> bdnz transform
   4 * implement powerpc-64 for darwin
   5
   6 ===-------------------------------------------------------------------------===
   7
   8 Use the stfiwx instruction for:
   9
  10 void foo(float a, int *b) { *b = a; }
  11
  12 ===-------------------------------------------------------------------------===
  13
  14 unsigned short foo(float a) { return a; }
  15 should be:
  16 _foo:
  17         fctiwz f0,f1
  18         stfd f0,-8(r1)
  19         lhz r3,-2(r1)
  20         blr
  21 not:
  22 _foo:
  23         fctiwz f0, f1
  24         stfd f0, -8(r1)
  25         lwz r2, -4(r1)
  26         rlwinm r3, r2, 0, 16, 31
  27         blr
  28
  29 ===-------------------------------------------------------------------------===
  30
  31 Support 'update' load/store instructions.  These are cracked on the G5, but are
  32 still a codesize win.
  33
  34 ===-------------------------------------------------------------------------===
  35
  36 We should hint to the branch select pass that it doesn't need to print the
  37 second unconditional branch, so we don't end up with things like:
  38         b .LBBl42__2E_expand_function_8_674     ; loopentry.24
  39         b .LBBl42__2E_expand_function_8_42      ; NewDefault
  40         b .LBBl42__2E_expand_function_8_42      ; NewDefault
  41
  42 ===-------------------------------------------------------------------------===
  43
  44 * Codegen this:
  45
  46    void test2(int X) {
  47      if (X == 0x12345678) bar();
  48    }
  49
  50     as:
  51
  52        xoris r0,r3,0x1234
  53        cmpwi cr0,r0,0x5678
  54        beq cr0,L6
  55
  56     not:
  57
  58         lis r2, 4660
  59         ori r2, r2, 22136
  60         cmpw cr0, r3, r2
  61         bne .LBB_test2_2
  62
  63 ===-------------------------------------------------------------------------===
  64
  65 Lump the constant pool for each function into ONE pic object, and reference
  66 pieces of it as offsets from the start.  For functions like this (contrived
  67 to have lots of constants obviously):
  68
  69 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
  70
  71 We generate:
  72
  73 _X:
  74         lis r2, ha16(.CPI_X_0)
  75         lfd f0, lo16(.CPI_X_0)(r2)
  76         lis r2, ha16(.CPI_X_1)
  77         lfd f2, lo16(.CPI_X_1)(r2)
  78         fmadd f0, f1, f0, f2
  79         lis r2, ha16(.CPI_X_2)
  80         lfd f1, lo16(.CPI_X_2)(r2)
  81         lis r2, ha16(.CPI_X_3)
  82         lfd f2, lo16(.CPI_X_3)(r2)
  83         fmadd f1, f0, f1, f2
  84         blr
  85
  86 It would be better to materialize .CPI_X into a register, then use immediates
  87 off of the register to avoid the lis's.  This is even more important in PIC
  88 mode.
  89
  90 Note that this (and the static variable version) is discussed here for GCC:
  91 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  92
  93 ===-------------------------------------------------------------------------===
  94
  95 Implement the Newton-Raphson method for improving estimate instructions to the
  96 correct accuracy, and implement divide as multiply by reciprocal when the
  97 divisor has more than one use.  Itanium will want this too.
  98
  99 ===-------------------------------------------------------------------------===
 100
 101 #define  ARRAY_LENGTH  16
 102
 103 union bitfield {
 104         struct {
 105 #ifndef __ppc__
 106                 unsigned int                       field0 : 6;
 107                 unsigned int                       field1 : 6;
 108                 unsigned int                       field2 : 6;
 109                 unsigned int                       field3 : 6;
 110                 unsigned int                       field4 : 3;
 111                 unsigned int                       field5 : 4;
 112                 unsigned int                       field6 : 1;
 113 #else
 114                 unsigned int                       field6 : 1;
 115                 unsigned int                       field5 : 4;
 116                 unsigned int                       field4 : 3;
 117                 unsigned int                       field3 : 6;
 118                 unsigned int                       field2 : 6;
 119                 unsigned int                       field1 : 6;
 120                 unsigned int                       field0 : 6;
 121 #endif
 122         } bitfields, bits;
 123         unsigned int    u32All;
 124         signed int      i32All;
 125         float   f32All;
 126 };
 127
 128
 129 typedef struct program_t {
 130         union bitfield    array[ARRAY_LENGTH];
 131     int               size;
 132     int               loaded;
 133 } program;
 134
 135
 136 void AdjustBitfields(program* prog, unsigned int fmt1)
 137 {
 138         unsigned int shift = 0;
 139         unsigned int texCount = 0;
 140         unsigned int i;
 141
 142         for (i = 0; i < 8; i++)
 143         {
 144                 prog->array[i].bitfields.field0 = texCount;
 145                 prog->array[i].bitfields.field1 = texCount + 1;
 146                 prog->array[i].bitfields.field2 = texCount + 2;
 147                 prog->array[i].bitfields.field3 = texCount + 3;
 148
 149                 texCount += (fmt1 >> shift) & 0x7;
 150                 shift    += 3;
 151         }
 152 }
 153
 154 In the loop above, the bitfield adds get generated as
 155 (add (shl bitfield, C1), (shl C2, C1)) where C2 is 1, 2 or 3.
 156
 157 Since the input to the (or and, and) is an (add) rather than a (shl), the shift
 158 doesn't get folded into the rlwimi instruction.  We should ideally see through
 159 things like this, rather than forcing llvm to generate the equivalent
 160
 161 (shl (add bitfield, C2), C1) with some kind of mask.
 162
 163 ===-------------------------------------------------------------------------===
 164
 165 Compile this:
 166
 167 int %f1(int %a, int %b) {
 168         %tmp.1 = and int %a, 15         ; <int> [#uses=1]
 169         %tmp.3 = and int %b, 240                ; <int> [#uses=1]
 170         %tmp.4 = or int %tmp.3, %tmp.1          ; <int> [#uses=1]
 171         ret int %tmp.4
 172 }
 173
 174 without a copy.  We make this currently:
 175
 176 _f1:
 177         rlwinm r2, r4, 0, 24, 27
 178         rlwimi r2, r3, 0, 28, 31
 179         or r3, r2, r2
 180         blr
 181
 182 The two-addr pass or RA needs to learn when it is profitable to commute an
 183 instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
 184 currently only commutes to avoid inserting a copy BEFORE the two addr instr.
 185
 186 ===-------------------------------------------------------------------------===
 187
 188 176.gcc contains a bunch of code like this (this occurs dozens of times):
 189
 190 int %test(uint %mode.0.i.0) {
 191         %tmp.79 = cast uint %mode.0.i.0 to sbyte        ; <sbyte> [#uses=1]
 192         %tmp.80 = cast sbyte %tmp.79 to int             ; <int> [#uses=1]
 193         %tmp.81 = shl int %tmp.80, ubyte 16             ; <int> [#uses=1]
 194         %tmp.82 = and int %tmp.81, 16711680
 195         ret int %tmp.82
 196 }
 197
 198 which we compile to:
 199
 200 _test:
 201         extsb r2, r3
 202         rlwinm r3, r2, 16, 8, 15
 203         blr
 204
 205 The extsb is obviously dead.  This can be handled by a future thing like
 206 MaskedValueIsZero that checks to see if bits are ever demanded (in this case,
 207 the sign bits are never used, so we can fold the sext_inreg to nothing).
 208
 209 I'm seeing code like this:
 210
 211         srwi r3, r3, 16
 212         extsb r3, r3
 213         rlwimi r4, r3, 16, 8, 15
 214
 215 in which the extsb is preventing the srwi from being nuked.
 216
 217 ===-------------------------------------------------------------------------===
 218
 219 Another example that occurs is:
 220
 221 uint %test(int %specbits.6.1) {
 222         %tmp.2540 = shr int %specbits.6.1, ubyte 11     ; <int> [#uses=1]
 223         %tmp.2541 = cast int %tmp.2540 to uint          ; <uint> [#uses=1]
 224         %tmp.2542 = shl uint %tmp.2541, ubyte 13        ; <uint> [#uses=1]
 225         %tmp.2543 = and uint %tmp.2542, 8192            ; <uint> [#uses=1]
 226         ret uint %tmp.2543
 227 }
 228
 229 which we codegen as:
 230
 231 l1_test:
 232         srawi r2, r3, 11
 233         rlwinm r3, r2, 13, 18, 18
 234         blr
 235
 236 the srawi can be nuked by turning the SAR into a logical SHR (the sext bits are
 237 dead), which I think can then be folded into the rlwinm.
 238
 239 ===-------------------------------------------------------------------------===
 240
 241 Compile offsets from allocas:
 242
 243 int *%test() {
 244         %X = alloca { int, int }
 245         %Y = getelementptr {int,int}* %X, int 0, uint 1
 246         ret int* %Y
 247 }
 248
 249 into a single add, not two:
 250
 251 _test:
 252         addi r2, r1, -8
 253         addi r3, r2, 4
 254         blr
 255
 256 --> important for C++.
 257
 258 ===-------------------------------------------------------------------------===
 259
 260 int test3(int a, int b) { return (a < 0) ? a : 0; }
 261
 262 should be branch free code.  LLVM is turning it into < 1 because of the RHS.
 263
 264 ===-------------------------------------------------------------------------===
 265
 266 No loads or stores of the constants should be needed:
 267
 268 struct foo { double X, Y; };
 269 void xxx(struct foo F);
 270 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 271
 272 ===-------------------------------------------------------------------------===
 273
 274 Darwin Stub LICM optimization:
 275
 276 Loops like this:
 277
 278   for (...)  bar();
 279
 280 have to go through an indirect stub if bar is external or linkonce.  It would
 281 be better to compile it as:
 282
 283      fp = &bar;
 284      for (...)  fp();
 285
 286 which only computes the address of bar once (instead of each time through the
 287 stub).  This is Darwin specific and would have to be done in the code generator.
 288 Probably not a win on x86.
 289
 290 ===-------------------------------------------------------------------------===
 291
 292 PowerPC i1/setcc stuff (depends on subreg stuff):
 293
 294 Check out the PPC code we get for 'compare' in this testcase:
 295 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672
 296
 297 oof.  on top of not doing the logical crnand instead of (mfcr, mfcr,
 298 invert, invert, or), we then have to compare it against zero instead of
 299 using the value already in a CR!
 300
 301 that should be something like
 302         cmpw cr7, r8, r5
 303         cmpw cr0, r7, r3
 304         crnand cr0, cr0, cr7
 305         bne cr0, LBB_compare_4
 306
 307 instead of
 308         cmpw cr7, r8, r5
 309         cmpw cr0, r7, r3
 310         mfcr r7, 1
 311         mcrf cr7, cr0
 312         mfcr r8, 1
 313         rlwinm r7, r7, 30, 31, 31
 314         rlwinm r8, r8, 30, 31, 31
 315         xori r7, r7, 1
 316         xori r8, r8, 1
 317         addi r2, r2, 1
 318         or r7, r8, r7
 319         cmpwi cr0, r7, 0
 320         bne cr0, LBB_compare_4  ; loopexit
 321
 322 ===-------------------------------------------------------------------------===
 323
 324 Simple IPO for argument passing, change:
 325   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 326
 327 the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 328 of arguments get assigned to r3 through r10. That is, if you have a function
 329 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 330 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 331 order for functions we can internalize so that the maximum number of
 332 integers/pointers get passed in regs before you see any of the fp arguments.
 333
 334 Instead of implementing this, it would actually probably be easier to just
 335 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 336 including having this work sanely.
 337
 338 ===-------------------------------------------------------------------------===
 339
 340 Fix Darwin FP-In-Integer Registers ABI
 341
 342 Darwin passes doubles in structures in integer registers, which is very very
 343 bad.  Add something like a BIT_CONVERT to LLVM, then do an interprocedural
 344 transformation that percolates these things out of functions.
 345
 346 Check out how horrible this is:
 347 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 348
 349 This is an extension of "interprocedural CC unmunging" that can't be done with
 350 just fastcc.
 351
 352 ===-------------------------------------------------------------------------===
 353
 354 Code Gen IPO optimization:
 355
 356 Squish small scalar globals together into a single global struct, allowing the
 357 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
 358 of the GOT on targets with one).
 359
 360 ===-------------------------------------------------------------------------===
 361
 362 Generate lwbrx and other byteswapping load/store instructions when reasonable.
 363
 364 ===-------------------------------------------------------------------------===
 365
 366 Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
 367 TargetConstantVec's if it's one of the many forms that are algorithmically
 368 computable using the spiffy altivec instructions.
 369
 370 ===-------------------------------------------------------------------------===
 371
 372 Compile this:
 373
 374 double %test(double %X) {
 375         %Y = cast double %X to long
 376         %Z = cast long %Y to double
 377         ret double %Z
 378 }
 379
 380 to this:
 381
 382 _test:
 383         fctidz f0, f1
 384         stfd f0, -8(r1)
 385         lwz r2, -4(r1)
 386         lwz r3, -8(r1)
 387         stw r2, -12(r1)
 388         stw r3, -16(r1)
 389         lfd f0, -16(r1)
 390         fcfid f1, f0
 391         blr
 392
 393 without the lwz/stw's.
 394
 395 ===-------------------------------------------------------------------------===
 396
 397 Compile this:
 398
 399 int foo(int a) {
 400   int b = (a < 8);
 401   if (b) {
 402     return b * 3;     // ignore the fact that this is always 3.
 403   } else {
 404     return 2;
 405   }
 406 }
 407
 408 into something other than this:
 409
 410 _foo:
 411 1)      cmpwi cr7, r3, 8
 412         mfcr r2, 1
 413         rlwinm r2, r2, 29, 31, 31
 414 1)      cmpwi cr0, r3, 7
 415         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 416 LBB1_1: ; then
 417         rlwinm r2, r2, 0, 31, 31
 418         mulli r3, r2, 3
 419         blr
 420 LBB1_2: ; UnifiedReturnBlock
 421         li r3, 2
 422         blr
 423
 424 In particular, the two compares (marked 1) could be shared by reversing one.
 425 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 426 same operands (but backwards) exists.  In this case, this wouldn't save us
 427 anything though, because the compares still wouldn't be shared.
 428
 429 ===-------------------------------------------------------------------------===
 430
 431 The legalizer should lower this:
 432
 433 bool %test(ulong %x) {
 434   %tmp = setlt ulong %x, 4294967296
 435   ret bool %tmp
 436 }
 437
 438 into "if x.high == 0", not:
 439
 440 _test:
 441         addi r2, r3, -1
 442         cntlzw r2, r2
 443         cntlzw r3, r3
 444         srwi r2, r2, 5
 445         srwi r4, r3, 5
 446         li r3, 0
 447         cmpwi cr0, r2, 0
 448         bne cr0, LBB1_2 ;
 449 LBB1_1:
 450         or r3, r4, r4
 451 LBB1_2:
 452         blr
 453
 454 Noticed in 2005-05-11-Popcount-ffs-fls.c.
 455
 456
 457 ===-------------------------------------------------------------------------===
 458
 459 We should custom expand setcc instead of pretending that we have it.  That
 460 would allow us to expose the access of the crbit after the mfcr, allowing
 461 that access to be trivially folded into other ops.  A simple example:
 462
 463 int foo(int a, int b) { return (a < b) << 4; }
 464
 465 compiles into:
 466
 467 _foo:
 468         cmpw cr7, r3, r4
 469         mfcr r2, 1
 470         rlwinm r2, r2, 29, 31, 31
 471         slwi r3, r2, 4
 472         blr
 473
 474 ===-------------------------------------------------------------------------===
 475
 476 Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
 477 precision don't matter (-ffast-math).  Misc/mandel will like this. :)
 478
 479 ===-------------------------------------------------------------------------===
 480
 481 Fold add and sub with constant into non-extern, non-weak addresses.  For this:
 482
 483 static int a;
 484 void bar(int b) { a = b; }
 485 void foo(unsigned char *c) {
 486   *c = a;
 487 }
 488
 489 so that the current output:
 490
 491 _foo:
 492         lis r2, ha16(_a)
 493         la r2, lo16(_a)(r2)
 494         lbz r2, 3(r2)
 495         stb r2, 0(r3)
 496         blr
 497
 498 Becomes
 499
 500 _foo:
 501         lis r2, ha16(_a+3)
 502         lbz r2, lo16(_a+3)(r2)
 503         stb r2, 0(r3)
 504         blr