[ppc] Distinguish the 'es', 'o', 'm', 'Q', 'Z', and 'Zy' inline assembly memory const...

[oota-llvm.git] / lib / Target / PowerPC / README.txt
diff --git a/lib/Target/PowerPC/README.txt b/lib/Target/PowerPC/README.txt

index 1341c66f2668a8bcb3df16fe014db8b1d2b3c45a..dfe988fc5cde548682675c460b728f432f921839 100644 (file)
--- a/lib/Target/PowerPC/README.txt
+++ b/lib/Target/PowerPC/README.txt
@@ -1,43 +1,33 @@
  //===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
  
  TODO:
-* gpr0 allocation
-* implement do-loop -> bdnz transform
  * lmw/stmw pass a la arm load store optimizer for prolog/epilog
  
  ===-------------------------------------------------------------------------===
  
-Support 'update' load/store instructions.  These are cracked on the G5, but are
-still a codesize win.
+This code:
  
-With preinc enabled, this:
-
-long *%test4(long *%X, long *%dest) {
-        %Y = getelementptr long* %X, int 4
-        %A = load long* %Y
-        store long %A, long* %dest
-        ret long* %Y
+unsigned add32carry(unsigned sum, unsigned x) {
+ unsigned z = sum + x;
+ if (sum + x < x)
+     z++;
+ return z;
  }
  
-compiles to:
+Should compile to something like:
  
-_test4:
-        mr r2, r3
-        lwzu r5, 32(r2)
-        lwz r3, 36(r3)
-        stw r5, 0(r4)
-        stw r3, 4(r4)
-        mr r3, r2
-        blr 
+       addc r3,r3,r4
+       addze r3,r3
  
-with -sched=list-burr, I get:
+instead we get:
  
-_test4:
-        lwz r2, 36(r3)
-        lwzu r5, 32(r3)
-        stw r2, 4(r4)
-        stw r5, 0(r4)
-        blr 
+       add r3, r4, r3
+       cmplw cr7, r3, r4
+       mfcr r4 ; 1
+       rlwinm r4, r4, 29, 31, 31
+       add r3, r3, r4
+
+Ick.
  
  ===-------------------------------------------------------------------------===
  
@@ -70,25 +60,6 @@ produced this with bdnz, the loop would be a single dispatch group.
  
  ===-------------------------------------------------------------------------===
  
-Compile:
-
-void foo(int *P) {
- if (P)  *P = 0;
-}
-
-into:
-
-_foo:
-        cmpwi cr0,r3,0
-        beqlr cr0
-        li r0,0
-        stw r0,0(r3)
-        blr
-
-This is effectively a simple form of predication.
-
-===-------------------------------------------------------------------------===
-
  Lump the constant pool for each function into ONE pic object, and reference
  pieces of it as offsets from the start.  For functions like this (contrived
  to have lots of constants obviously):
@@ -147,45 +118,13 @@ http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  
  ===-------------------------------------------------------------------------===
  
-Implement Newton-Raphson method for improving estimate instructions to the
-correct accuracy, and implementing divide as multiply by reciprocal when it has
-more than one use.  Itanium will want this too.
-
-===-------------------------------------------------------------------------===
-
-Compile offsets from allocas:
-
-int *%test() {
-        %X = alloca { int, int }
-        %Y = getelementptr {int,int}* %X, int 0, uint 1
-        ret int* %Y
-}
-
-into a single add, not two:
-
-_test:
-        addi r2, r1, -8
-        addi r3, r2, 4
-        blr
-
---> important for C++.
-
-===-------------------------------------------------------------------------===
-
-No loads or stores of the constants should be needed:
-
-struct foo { double X, Y; };
-void xxx(struct foo F);
-void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
-
-===-------------------------------------------------------------------------===
-
  Darwin Stub removal:
  
  We still generate calls to foo$stub, and stubs, on Darwin.  This is not
-necessary on Leopard (10.5) or later, as stubs are generated by ld when
-necessary.  The choice should depend on the value of -mmacosx-version-min.
-x86-32 does this right, see its logic.
+necessary when building with the Leopard (10.5) or later linker, as stubs are
+generated by ld when necessary.  Parameterizing this based on the deployment
+target (-mmacosx-version-min) is probably enough.  x86-32 does this right, see
+its logic.
  
  ===-------------------------------------------------------------------------===
  
@@ -205,8 +144,6 @@ which only computes the address of bar once (instead of each time through the
  stub).  This is Darwin specific and would have to be done in the code generator.
  Probably not a win on x86.
  
-Note that removing stubs altogether, as in the previous item, is better yet.
-
  ===-------------------------------------------------------------------------===
  
  Simple IPO for argument passing, change:
@@ -228,8 +165,8 @@ including having this work sanely.
  Fix Darwin FP-In-Integer Registers ABI
  
  Darwin passes doubles in structures in integer registers, which is very very 
-bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation 
-that percolates these things out of functions.
+bad.  Add something like a BITCAST to LLVM, then do an i-p transformation that
+percolates these things out of functions.
  
  Check out how horrible this is:
  http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
@@ -239,57 +176,6 @@ just fastcc.
  
  ===-------------------------------------------------------------------------===
  
-Compile this:
-
-int foo(int a) {
-  int b = (a < 8);
-  if (b) {
-    return b * 3;     // ignore the fact that this is always 3.
-  } else {
-    return 2;
-  }
-}
-
-into something not this:
-
-_foo:
-1)      cmpwi cr7, r3, 8
-        mfcr r2, 1
-        rlwinm r2, r2, 29, 31, 31
-1)      cmpwi cr0, r3, 7
-        bgt cr0, LBB1_2 ; UnifiedReturnBlock
-LBB1_1: ; then
-        rlwinm r2, r2, 0, 31, 31
-        mulli r3, r2, 3
-        blr
-LBB1_2: ; UnifiedReturnBlock
-        li r3, 2
-        blr
-
-In particular, the two compares (marked 1) could be shared by reversing one.
-This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
-same operands (but backwards) exists.  In this case, this wouldn't save us 
-anything though, because the compares still wouldn't be shared.
-
-===-------------------------------------------------------------------------===
-
-We should custom expand setcc instead of pretending that we have it.  That
-would allow us to expose the access of the crbit after the mfcr, allowing
-that access to be trivially folded into other ops.  A simple example:
-
-int foo(int a, int b) { return (a < b) << 4; }
-
-compiles into:
-
-_foo:
-        cmpw cr7, r3, r4
-        mfcr r2, 1
-        rlwinm r2, r2, 29, 31, 31
-        slwi r3, r2, 4
-        blr
-
-===-------------------------------------------------------------------------===
-
  Fold add and sub with constant into non-extern, non-weak addresses so this:
  
  static int a;
@@ -317,48 +203,6 @@ _foo:
  
  ===-------------------------------------------------------------------------===
  
-We generate really bad code for this:
-
-int f(signed char *a, _Bool b, _Bool c) {
-   signed char t = 0;
-  if (b)  t = *a;
-  if (c)  *a = t;
-}
-
-===-------------------------------------------------------------------------===
-
-This:
-int test(unsigned *P) { return *P >> 24; }
-
-Should compile to:
-
-_test:
-        lbz r3,0(r3)
-        blr
-
-not:
-
-_test:
-        lwz r2, 0(r3)
-        srwi r3, r2, 24
-        blr
-
-===-------------------------------------------------------------------------===
-
-On the G5, logical CR operations are more expensive in their three
-address form: ops that read/write the same register are half as expensive as
-those that read from two registers that are different from their destination.
-
-We should model this with two separate instructions.  The isel should generate
-the "two address" form of the instructions.  When the register allocator 
-detects that it needs to insert a copy due to the two-addressness of the CR
-logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
-we can convert to the "three address" instruction, to save code space.
-
-This only matters when we start generating cr logical ops.
-
-===-------------------------------------------------------------------------===
-
  We should compile these two functions to the same thing:
  
  #include <stdlib.h>
@@ -398,6 +242,35 @@ This theoretically may help improve twolf slightly (used in dimbox.c:142?).
  
  ===-------------------------------------------------------------------------===
  
+PR5945: This: 
+define i32 @clamp0g(i32 %a) {
+entry:
+        %cmp = icmp slt i32 %a, 0
+        %sel = select i1 %cmp, i32 0, i32 %a
+        ret i32 %sel
+}
+
+Is compiled to this with the PowerPC (32-bit) backend:
+
+_clamp0g:
+        cmpwi cr0, r3, 0
+        li r2, 0
+        blt cr0, LBB1_2
+; BB#1:                                                     ; %entry
+        mr r2, r3
+LBB1_2:                                                     ; %entry
+        mr r3, r2
+        blr
+
+This could be reduced to the much simpler:
+
+_clamp0g:
+        srawi r2, r3, 31
+        andc r3, r3, r2
+        blr
+
+===-------------------------------------------------------------------------===
+
  int foo(int N, int ***W, int **TK, int X) {
    int t, i;
    
@@ -415,27 +288,6 @@ http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
  
  ===-------------------------------------------------------------------------===
  
-float foo(float X) { return (int)(X); }
-
-Currently produces:
-
-_foo:
-        fctiwz f0, f1
-        stfd f0, -8(r1)
-        lwz r2, -4(r1)
-        extsw r2, r2
-        std r2, -16(r1)
-        lfd f0, -16(r1)
-        fcfid f0, f0
-        frsp f1, f0
-        blr
-
-We could use a target dag combine to turn the lwz/extsw into an lwa when the 
-lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
-win only.
-
-===-------------------------------------------------------------------------===
-
  We generate ugly code for this:
  
  void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
@@ -451,20 +303,6 @@ void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
  
  ===-------------------------------------------------------------------------===
  
-Complete the signed i32 to FP conversion code using 64-bit registers
-transformation, good for PI.  See PPCISelLowering.cpp, this comment:
-
-     // FIXME: disable this lowered code.  This generates 64-bit register values,
-     // and we don't model the fact that the top part is clobbered by calls.  We
-     // need to flag these together so that the value isn't live across a call.
-     //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
-
-Also, if the registers are spilled to the stack, we have to ensure that all
-64 bits of them are saved/restored, otherwise we will miscompile the code.  It
-sounds like we need to get the 64-bit register classes going.
-
-===-------------------------------------------------------------------------===
-
  %struct.B = type { i8, [3 x i8] }
  
  define void @bar(%struct.B* %b) {
@@ -507,32 +345,6 @@ _foo:
  
  ===-------------------------------------------------------------------------===
  
-We compile:
-
-unsigned test6(unsigned x) { 
-  return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
-}
-
-into:
-
-_test6:
-        lis r2, 255
-        rlwinm r3, r3, 16, 0, 31
-        ori r2, r2, 255
-        and r3, r3, r2
-        blr
-
-GCC gets it down to:
-
-_test6:
-        rlwinm r0,r3,16,8,15
-        rlwinm r3,r3,16,24,31
-        or r3,r3,r0
-        blr
-
-
-===-------------------------------------------------------------------------===
-
  Consider a function like this:
  
  float foo(float X) { return X + 1234.4123f; }
@@ -603,48 +415,32 @@ This sort of thing occurs a lot due to globalopt.
  
  ===-------------------------------------------------------------------------===
  
-We currently compile 32-bit bswap:
+We compile:
  
-declare i32 @llvm.bswap.i32(i32 %A)
-define i32 @test(i32 %A) {
-        %B = call i32 @llvm.bswap.i32(i32 %A)
-        ret i32 %B
+define i32 @bar(i32 %x) nounwind readnone ssp {
+entry:
+  %0 = icmp eq i32 %x, 0                          ; <i1> [#uses=1]
+  %neg = sext i1 %0 to i32              ; <i32> [#uses=1]
+  ret i32 %neg
  }
  
  to:
  
-_test:
-        rlwinm r2, r3, 24, 16, 23
-        slwi r4, r3, 24
-        rlwimi r2, r3, 8, 24, 31
-        rlwimi r4, r3, 8, 8, 15
-        rlwimi r4, r2, 0, 16, 31
-        mr r3, r4
-        blr 
+_bar:
+       cntlzw r2, r3
+       slwi r2, r2, 26
+       srawi r3, r2, 31
+       blr 
  
-it would be more efficient to produce:
+it would be better to produce:
  
-_foo:   mr r0,r3
-        rlwinm r3,r3,8,0xffffffff
-        rlwimi r3,r0,24,0,7
-        rlwimi r3,r0,24,16,23
+_bar: 
+        addic r3,r3,-1
+        subfe r3,r3,r3
          blr
  
  ===-------------------------------------------------------------------------===
  
-test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:
-
-__ZNK4llvm5APInt17countLeadingZerosEv:
-        ld r2, 0(r3)
-        cntlzd r2, r2
-        or r2, r2, r2     <<-- silly.
-        addi r3, r2, -64
-        blr 
-
-The dead or is a 'truncate' from 64- to 32-bits.
-
-===-------------------------------------------------------------------------===
-
  We generate horrible ppc code for this:
  
  #define N  2000000
@@ -802,9 +598,33 @@ entry:
  ; recognize a more elaborate tree than a simple SETxx.
  
  define double @test_FNEG_sel(double %A, double %B, double %C) {
-        %D = sub double -0.000000e+00, %A               ; <double> [#uses=1]
+        %D = fsub double -0.000000e+00, %A               ; <double> [#uses=1]
          %Cond = fcmp ugt double %D, -0.000000e+00               ; <i1> [#uses=1]
          %E = select i1 %Cond, double %B, double %C              ; <double> [#uses=1]
          ret double %E
  }
  
+//===----------------------------------------------------------------------===//
+The save/restore sequence for CR in prolog/epilog is terrible:
+- Each CR subreg is saved individually, rather than doing one save as a unit.
+- On Darwin, the save is done after the decrement of SP, which means the offset
+from SP of the save slot can be too big for a store instruction, which means we
+need an additional register (currently hacked in 96015+96020; the solution there
+is correct, but poor).
+- On SVR4 the same thing can happen, and I don't think saving before the SP
+decrement is safe on that target, as there is no red zone.  This is currently
+broken AFAIK, although it's not a target I can exercise.
+The following demonstrates the problem:
+extern void bar(char *p);
+void foo() {
+  char x[100000];
+  bar(x);
+  __asm__("" ::: "cr2");
+}
+
+//===----------------------------------------------------------------------===//
+
+Instruction fusion was introduced in ISA 2.06, and more opportunities were added
+in ISA 2.07.  LLVM needs to add infrastructure to recognize fusion opportunities
+and force instruction pairs to be scheduled together.
+