* We would really like to support UXTAB16, but we need to prove that the
  add cannot carry from the low 16-bit chunk into the high one (see the
  sketch after this list).
* Implement pre/post increment support. (e.g. PR935)
* Coalesce stack slots!
* Implement smarter constant generation for binops with large immediates.
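
Regarding the UXTAB16 item above, a minimal sketch of the kind of source that
could become a single UXTAB16, assuming the no-carry proof is available (the
function name is just illustrative):

unsigned uxtab16_candidate(unsigned a, unsigned b) {
  /* Add the zero-extended bytes at bits [7:0] and [23:16] of b into the two
     halfwords packed in a.  A plain 32-bit add only matches UXTAB16 (whose
     halfword adds do not carry into each other) if the low 16-bit sum
     provably cannot carry into the high halfword. */
  unsigned lo = b & 0xff;          /* zero-extended byte 0 */
  unsigned hi = (b >> 16) & 0xff;  /* zero-extended byte 2 */
  return a + lo + (hi << 16);
}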
//===---------------------------------------------------------------------===//

Crazy idea:  Consider code that uses lots of 8-bit or 16-bit values.  By the
time regalloc happens, these values are now in a 32-bit register, usually with
the top bits known to be sign or zero extended.  If spilled, we should be able
to spill these to an 8-bit or 16-bit stack slot, zero or sign extending as
part of the reload.

Doing this reduces the size of the stack frame (important for thumb etc), and
also increases the likelihood that we will be able to reload multiple values
from the stack with a single load.
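
A minimal sketch of the kind of code in question (names are illustrative):
four zero-extended byte values live across calls; each one that spills today
takes a 4-byte slot, where a 1-byte slot plus a zero-extending reload (ldrb)
would do.

void use(unsigned, unsigned, unsigned, unsigned);

void spill_narrow(const unsigned char *p) {
  /* a..d are i8 values zero-extended into 32-bit registers. */
  unsigned a = p[0], b = p[1], c = p[2], d = p[3];
  use(a, b, c, d);   /* the call forces the values to be kept somewhere */
  use(d, c, b, a);   /* reused afterwards, so they stay live across it   */
}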

//===---------------------------------------------------------------------===//

The constant island pass is in good shape.  Some cleanups might be desirable,
but there is unlikely to be much improvement in the generated code.

1. There may be some advantage to trying to be smarter about the initial
placement, rather than putting everything at the end.

2. There might be some compile-time efficiency to be had by representing
consecutive islands as a single block rather than multiple blocks.

3. Use a priority queue to sort constant pool users in inverse order of
   position so we always process the one closest to the end of the function
   first.  This may simplify CreateNewWater.
//===---------------------------------------------------------------------===//

Eliminate copysign custom expansion. We are still generating crappy code with
default expansion + if-conversion.
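
A minimal example of source that exercises this (the function name is
illustrative); the conditional negate in the generic expansion is exactly what
if-conversion ought to fold into a predicated instruction:

#include <math.h>

/* Returns the magnitude of x with the sign of y; the generic expansion
   needs a conditional negate of the magnitude. */
double copysign_example(double x, double y) {
  return copysign(x, y);
}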

//===---------------------------------------------------------------------===//

Eliminate one instruction from:

define i32 @_Z6slow4bii(i32 %x, i32 %y) {
        %tmp = icmp sgt i32 %x, %y
        %retval = select i1 %tmp, i32 %x, i32 %y
        ret i32 %retval
}
__Z6slow4bii:
        cmp     r0, r1
        movgt   r1, r0
        mov     r0, r1
        bx      lr
=>

__Z6slow4bii:
        cmp     r0, r1
        movle   r0, r1
        bx      lr

//===---------------------------------------------------------------------===//
aligned on 8-byte boundary. This requires more information on load / store
nodes (and MI's?) than we currently carry.

6) struct copies appear to be done field by field
instead of by words, at least sometimes:

struct foo { int x; short s; char c1; char c2; };
void cpy(struct foo*a, struct foo*b) { *a = *b; }

llvm code (-O2)
        ldrb r3, [r1, #+6]
        ldr r2, [r1]
        ldrb r12, [r1, #+7]
        ldrh r1, [r1, #+4]
        str r2, [r0]
        strh r1, [r0, #+4]
        strb r3, [r0, #+6]
        strb r12, [r0, #+7]
gcc code (-O2)
        ldmia r1, {r1-r2}
        stmia r0, {r1-r2}

In this benchmark poor handling of aggregate copies has shown up as
having a large effect on size, and possibly speed as well (we don't have
a good way to measure on ARM).

//===---------------------------------------------------------------------===//
* Consider this silly example:
}
_bar:
        stmfd sp!, {r4, r5, r7, lr}
        add r7, sp, #8
        mov r4, r0
        mov r5, r1
        fldd d0, LCPI1_0
        fmrrd r0, r1, d0
        bl _foo
        fmdrr d0, r4, r5
        fmsr s2, r0
        fsitod d1, s2
        faddd d0, d1, d0
        fmrrd r0, r1, d0
        ldmfd sp!, {r4, r5, r7, pc}

Ignore the prologue and epilogue stuff for a second. Note
mov r4, r0
//===---------------------------------------------------------------------===//

Register scavenging is now implemented.  The example in the previous version
of this document produces optimal code at -O2.
//===---------------------------------------------------------------------===//
http://citeseer.ist.psu.edu/debus04linktime.html
//===---------------------------------------------------------------------===//

gcc generates smaller code for this function at -O2 or -Os:

void foo(signed char* p) {
  if (*p == 3)
    bar();
  else if (*p == 4)
    baz();
  else if (*p == 5)
    quux();
}

llvm decides it's a good idea to turn the repeated if...else into a
binary tree, as if it were a switch; the resulting code requires one fewer
compare-and-branch when *p<=2 or *p==5, the same number if *p==4
or *p>6, and one more if *p==3.  So it should be a speed win
(on balance).  However, the revised code is larger, with 4 conditional
branches instead of 3.

More seriously, there is a byte->word extend before
each comparison, where there should be only one, and the condition codes
are not remembered when the same two values are compared twice.

//===---------------------------------------------------------------------===//

More register scavenging work:

1. Use the register scavenger to track frame indexes materialized into
   registers (i.e. those that do not fit in addressing modes) so they can be
   reused within the same BB (see the sketch after this list).
2. Finish scavenging for Thumb.
3. We know some spills and restores are unnecessary. The issue is that once
   live intervals are merged, they are never split. So every def is spilled
   and every use requires a restore if the register allocator decides the
   resulting live interval is not assigned a physical register. It may be
   possible (with the help of the scavenger) to turn some spill / restore
   pairs into register copies.
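
A hedged illustration of item 1 (the function, helper, and array size are made
up; the point is only that the offsets exceed the load / store immediate
range, so a base register must be materialized):

void consume(int *);

int frame_index_reuse(int a, int b) {
  int big[2000];
  /* Offsets of ~7960 bytes do not fit in an ARM load / store addressing
     mode, so a base (add rX, sp, #...) is materialized for each access;
     with scavenger bookkeeping the second access could reuse the base
     computed for the first instead of recomputing it. */
  big[1990] = a;
  big[1991] = b;
  consume(big);                      /* keep the array and stores alive */
  return big[1990] + big[1991];
}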

//===---------------------------------------------------------------------===//

More LSR enhancements possible:

1. Teach LSR about pre- and post-indexed ops so the iv increment can be merged
   into a load / store (see the sketch after this list).
2. Allow iv reuse even when a type conversion is required. For example, i8
   and i32 load / store addressing modes are identical.
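
A hedged sketch of item 1 (names are illustrative): the induction-variable
increment below could fold into a post-indexed load (e.g. ldr rX, [rY], #4)
once LSR knows about those addressing modes:

int sum_ints(const int *p, int n) {
  int s = 0;
  int i;
  for (i = 0; i < n; ++i)
    s += p[i];  /* load + pointer bump could be one post-indexed ldr */
  return s;
}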

//===---------------------------------------------------------------------===//

This:

int foo(int a, int b, int c, int d) {
  long long acc = (long long)a * (long long)b;
  acc += (long long)c * (long long)d;
  return (int)(acc >> 32);
}

Should compile to use SMLAL (Signed Multiply Accumulate Long) which multiplies
two signed 32-bit values to produce a 64-bit value, and accumulates this with
a 64-bit value.

We currently get this with both v4 and v6:

_foo:
        smull r1, r0, r1, r0
        smull r3, r2, r3, r2
        adds r3, r3, r1
        adc r0, r2, r0
        bx lr

//===---------------------------------------------------------------------===//

This:
        #include <utility>
        std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
        { return std::make_pair(a + b, a + b < a); }
        bool no_overflow(unsigned a, unsigned b)
        { return !full_add(a, b).second; }

Should compile to:

_Z8full_addjj:
        adds    r2, r1, r2
        movcc   r1, #0
        movcs   r1, #1
        str     r2, [r0, #0]
        strb    r1, [r0, #4]
        mov     pc, lr

_Z11no_overflowjj:
        cmn     r0, r1
        movcs   r0, #0
        movcc   r0, #1
        mov     pc, lr

not:

__Z8full_addjj:
        add     r3, r2, r1
        str     r3, [r0]
        mov     r2, #1
        mov     r12, #0
        cmp     r3, r1
        movlo   r12, r2
        str     r12, [r0, #+4]
        bx      lr
__Z11no_overflowjj:
        add     r3, r1, r0
        mov     r2, #1
        mov     r1, #0
        cmp     r3, r0
        movhs   r1, r2
        mov     r0, r1
        bx      lr

//===---------------------------------------------------------------------===//