+//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
+
TODO:
* gpr0 allocation
* implement do-loop -> bdnz transform
* implement powerpc-64 for darwin
-* use stfiwx in float->int
-
-* Fold add and sub with constant into non-extern, non-weak addresses so this:
- lis r2, ha16(l2__ZTV4Cell)
- la r2, lo16(l2__ZTV4Cell)(r2)
- addi r2, r2, 8
-becomes:
- lis r2, ha16(l2__ZTV4Cell+8)
- la r2, lo16(l2__ZTV4Cell+8)(r2)
+===-------------------------------------------------------------------------===
-* Teach LLVM how to codegen this:
-unsigned short foo(float a) { return a; }
-as:
-_foo:
- fctiwz f0,f1
- stfd f0,-8(r1)
- lhz r3,-2(r1)
- blr
-not:
-_foo:
- fctiwz f0, f1
- stfd f0, -8(r1)
- lwz r2, -4(r1)
- rlwinm r3, r2, 0, 16, 31
- blr
+Support 'update' load/store instructions. These are cracked on the G5, but are
+still a codesize win.
-* Support 'update' load/store instructions. These are cracked on the G5, but
- are still a codesize win.
+===-------------------------------------------------------------------------===
-* should hint to the branch select pass that it doesn't need to print the
- second unconditional branch, so we don't end up with things like:
- b .LBBl42__2E_expand_function_8_674 ; loopentry.24
- b .LBBl42__2E_expand_function_8_42 ; NewDefault
- b .LBBl42__2E_expand_function_8_42 ; NewDefault
+Teach the .td file to pattern match PPC::BR_COND to the appropriate bc variant, so
+we don't have to always run the branch selector for small functions.
===-------------------------------------------------------------------------===
as:
xoris r0,r3,0x1234
- cmpwi cr0,r0,0x5678
+ cmplwi cr0,r0,0x5678
beq cr0,L6
not:
===-------------------------------------------------------------------------===
+PIC Code Gen IPO optimization:
+
+Squish small scalar globals together into a single global struct, allowing the
+address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
+of the GOT on targets with one).
+
+Note that this is discussed here for GCC:
+http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
+
+===-------------------------------------------------------------------------===
+
Implement the Newton-Raphson method for improving estimate instructions to the
correct accuracy, and implement divide as multiply by reciprocal when it has
more than one use.  Itanium will want this too.
void AdjustBitfields(program* prog, unsigned int fmt1)
{
- unsigned int shift = 0;
- unsigned int texCount = 0;
- unsigned int i;
-
- for (i = 0; i < 8; i++)
- {
- prog->array[i].bitfields.field0 = texCount;
- prog->array[i].bitfields.field1 = texCount + 1;
- prog->array[i].bitfields.field2 = texCount + 2;
- prog->array[i].bitfields.field3 = texCount + 3;
-
- texCount += (fmt1 >> shift) & 0x7;
- shift += 3;
- }
+ prog->array[0].bitfields.field0 = fmt1;
+ prog->array[0].bitfields.field1 = fmt1 + 1;
}
-In the loop above, the bitfield adds get generated as
-(add (shl bitfield, C1), (shl C2, C1)) where C2 is 1, 2 or 3.
+We currently generate:
+
+_AdjustBitfields:
+ lwz r2, 0(r3)
+ addi r5, r4, 1
+ rlwinm r2, r2, 0, 0, 19
+ rlwinm r5, r5, 6, 20, 25
+ rlwimi r2, r4, 0, 26, 31
+ or r2, r2, r5
+ stw r2, 0(r3)
+ blr
-Since the input to the (or and, and) is an (add) rather than a (shl), the shift
-doesn't get folded into the rlwimi instruction. We should ideally see through
-things like this, rather than forcing llvm to generate the equivalent
+We should teach someone that or (rlwimi, rlwinm) with disjoint masks can be
+turned into rlwimi (rlwimi).
-(shl (add bitfield, C2), C1) with some kind of mask.
+The better codegen would be:
+
+_AdjustBitfields:
+ lwz r0,0(r3)
+ rlwinm r4,r4,0,0xff
+ rlwimi r0,r4,0,26,31
+ addi r4,r4,1
+ rlwimi r0,r4,6,20,25
+ stw r0,0(r3)
+ blr
===-------------------------------------------------------------------------===
===-------------------------------------------------------------------------===
-176.gcc contains a bunch of code like this (this occurs dozens of times):
-
-int %test(uint %mode.0.i.0) {
- %tmp.79 = cast uint %mode.0.i.0 to sbyte ; <sbyte> [#uses=1]
- %tmp.80 = cast sbyte %tmp.79 to int ; <int> [#uses=1]
- %tmp.81 = shl int %tmp.80, ubyte 16 ; <int> [#uses=1]
- %tmp.82 = and int %tmp.81, 16711680
- ret int %tmp.82
-}
-
-which we compile to:
-
-_test:
- extsb r2, r3
- rlwinm r3, r2, 16, 8, 15
- blr
-
-The extsb is obviously dead. This can be handled by a future thing like
-MaskedValueIsZero that checks to see if bits are ever demanded (in this case,
-the sign bits are never used, so we can fold the sext_inreg to nothing).
-
-I'm seeing code like this:
-
- srwi r3, r3, 16
- extsb r3, r3
- rlwimi r4, r3, 16, 8, 15
-
-in which the extsb is preventing the srwi from being nuked.
-
-===-------------------------------------------------------------------------===
-
-Another example that occurs is:
-
-uint %test(int %specbits.6.1) {
- %tmp.2540 = shr int %specbits.6.1, ubyte 11 ; <int> [#uses=1]
- %tmp.2541 = cast int %tmp.2540 to uint ; <uint> [#uses=1]
- %tmp.2542 = shl uint %tmp.2541, ubyte 13 ; <uint> [#uses=1]
- %tmp.2543 = and uint %tmp.2542, 8192 ; <uint> [#uses=1]
- ret uint %tmp.2543
-}
-
-which we codegen as:
-
-l1_test:
- srawi r2, r3, 11
- rlwinm r3, r2, 13, 18, 18
- blr
-
-the srawi can be nuked by turning the SAR into a logical SHR (the sext bits are
-dead), which I think can then be folded into the rlwinm.
-
-===-------------------------------------------------------------------------===
-
Compile offsets from allocas:
int *%test() {
cmpwi cr0, r7, 0
bne cr0, LBB_compare_4 ; loopexit
+FreeBench/mason has a basic block that looks like this:
+
+ %tmp.130 = seteq int %p.0__, 5 ; <bool> [#uses=1]
+ %tmp.134 = seteq int %p.1__, 6 ; <bool> [#uses=1]
+ %tmp.139 = seteq int %p.2__, 12 ; <bool> [#uses=1]
+ %tmp.144 = seteq int %p.3__, 13 ; <bool> [#uses=1]
+ %tmp.149 = seteq int %p.4__, 14 ; <bool> [#uses=1]
+ %tmp.154 = seteq int %p.5__, 15 ; <bool> [#uses=1]
+ %bothcond = and bool %tmp.134, %tmp.130 ; <bool> [#uses=1]
+ %bothcond123 = and bool %bothcond, %tmp.139 ; <bool>
+ %bothcond124 = and bool %bothcond123, %tmp.144 ; <bool>
+ %bothcond125 = and bool %bothcond124, %tmp.149 ; <bool>
+ %bothcond126 = and bool %bothcond125, %tmp.154 ; <bool>
+ br bool %bothcond126, label %shortcirc_next.5, label %else.0
+
+This is a particularly important case where handling CRs better will help.
+
===-------------------------------------------------------------------------===
Simple IPO for argument passing, change:
===-------------------------------------------------------------------------===
-Code Gen IPO optimization:
-
-Squish small scalar globals together into a single global struct, allowing the
-address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
-of the GOT on targets with one).
-
-===-------------------------------------------------------------------------===
-
Generate lwbrx and other byteswapping load/store instructions when reasonable.
===-------------------------------------------------------------------------===
-Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
-TargetConstantVec's if it's one of the many forms that are algorithmically
-computable using the spiffy altivec instructions.
-
-===-------------------------------------------------------------------------===
-
-Compile this:
-
-double %test(double %X) {
- %Y = cast double %X to long
- %Z = cast long %Y to double
- ret double %Z
-}
-
-to this:
-
-_test:
- fctidz f0, f1
- stfd f0, -8(r1)
- lwz r2, -4(r1)
- lwz r3, -8(r1)
- stw r2, -12(r1)
- stw r3, -16(r1)
- lfd f0, -16(r1)
- fcfid f1, f0
- blr
-
-without the lwz/stw's.
-
-===-------------------------------------------------------------------------===
-
Compile this:
int foo(int a) {
===-------------------------------------------------------------------------===
-Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
-precision don't matter (ffastmath). Misc/mandel will like this. :)
+Fold add and sub with constant into non-extern, non-weak addresses so this:
+
+static int a;
+void bar(int b) { a = b; }
+void foo(unsigned char *c) {
+ *c = a;
+}
+
+So that
+
+_foo:
+ lis r2, ha16(_a)
+ la r2, lo16(_a)(r2)
+ lbz r2, 3(r2)
+ stb r2, 0(r3)
+ blr
+
+Becomes
+
+_foo:
+ lis r2, ha16(_a+3)
+ lbz r2, lo16(_a+3)(r2)
+ stb r2, 0(r3)
+ blr
+
+===-------------------------------------------------------------------------===
+
+We generate really bad code for this:
+
+int f(signed char *a, _Bool b, _Bool c) {
+ signed char t = 0;
+ if (b) t = *a;
+ if (c) *a = t;
+}
+
+===-------------------------------------------------------------------------===
+
+This:
+int test(unsigned *P) { return *P >> 24; }
+
+Should compile to:
+
+_test:
+ lbz r3,0(r3)
+ blr
+
+not:
+
+_test:
+ lwz r2, 0(r3)
+ srwi r3, r2, 24
+ blr
+
+===-------------------------------------------------------------------------===
+
+On the G5, logical CR operations are more expensive in their three
+address form: ops that read/write the same register are half as expensive as
+those that read from two registers that are different from their destination.
+
+We should model this with two separate instructions. The isel should generate
+the "two address" form of the instructions. When the register allocator
+detects that it needs to insert a copy due to the two-addressness of the CR
+logical op, it will invoke PPCInstrInfo::convertToThreeAddress. At this point
+we can convert to the "three address" instruction, to save code space.
+
+This only matters when we start generating cr logical ops.
+
+===-------------------------------------------------------------------------===
+
+We should compile these two functions to the same thing:
+
+#include <stdlib.h>
+void f(int a, int b, int *P) {
+ *P = (a-b)>=0?(a-b):(b-a);
+}
+void g(int a, int b, int *P) {
+ *P = abs(a-b);
+}
+
+Further, they should compile to something better than:
+
+_g:
+ subf r2, r4, r3
+ subfic r3, r2, 0
+ cmpwi cr0, r2, -1
+ bgt cr0, LBB2_2 ; entry
+LBB2_1: ; entry
+ mr r2, r3
+LBB2_2: ; entry
+ stw r2, 0(r5)
+ blr
+
+GCC produces:
+
+_g:
+ subf r4,r4,r3
+ srawi r2,r4,31
+ xor r0,r2,r4
+ subf r0,r2,r0
+ stw r0,0(r5)
+ blr
+
+... which is much nicer.
+
+This theoretically may help improve twolf slightly (used in dimbox.c:142?).
+
+===-------------------------------------------------------------------------===
+
+int foo(int N, int ***W, int **TK, int X) {
+ int t, i;
+
+ for (t = 0; t < N; ++t)
+ for (i = 0; i < 4; ++i)
+ W[t / X][i][t % X] = TK[i][t];
+
+ return 5;
+}
+
+We generate relatively atrocious code for this loop compared to gcc.
+
+We could also strength reduce the rem and the div:
+http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
+
+===-------------------------------------------------------------------------===
+
+float foo(float X) { return (int)(X); }
+
+Currently produces:
+
+_foo:
+ fctiwz f0, f1
+ stfd f0, -8(r1)
+ lwz r2, -4(r1)
+ extsw r2, r2
+ std r2, -16(r1)
+ lfd f0, -16(r1)
+ fcfid f0, f0
+ frsp f1, f0
+ blr
+
+We could use a target dag combine to turn the lwz/extsw into an lwa when the
+lwz has a single use. Since LWA is cracked anyway, this would be a codesize
+win only.
+
+===-------------------------------------------------------------------------===
+
+We generate ugly code for this:
+
+void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
+ unsigned code = 0;
+ if(dx < -dw) code |= 1;
+ if(dx > dw) code |= 2;
+ if(dy < -dw) code |= 4;
+ if(dy > dw) code |= 8;
+ if(dz < -dw) code |= 16;
+ if(dz > dw) code |= 32;
+ *ret = code;
+}
+
+===-------------------------------------------------------------------------===