Fix the encoding for two more "rm" instructions that were using MRMSrcReg.

[oota-llvm.git] / lib / Target / README.txt
diff --git a/lib/Target/README.txt b/lib/Target/README.txt

index 80b00377ae71486baa1820baac2e001be1a3f1f9..1b55ac69bf56a77cd51e6dfbde26e7ec74478435 100644 (file)
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -705,3 +705,113 @@ int f() {
  }
  
  //===---------------------------------------------------------------------===//
+
+The loop unroller should partially unroll loops (instead of peeling them)
+when code growth isn't too bad and when an unroll count allows simplification
+of some code within the loop.  One trivial example is:
+
+#include <stdio.h>
+int main() {
+    int nRet = 17;
+    int nLoop;
+    for ( nLoop = 0; nLoop < 1000; nLoop++ ) {
+        if ( nLoop & 1 )
+            nRet += 2;
+        else
+            nRet -= 1;
+    }
+    return nRet;
+}
+
+Unrolling by 2 would eliminate the '&1' in both copies, leading to a net
+reduction in code size.  The resultant code would then also be suitable for
+exit value computation.
+
+//===---------------------------------------------------------------------===//
+
+We miss a bunch of rotate opportunities on various targets, including ppc, x86,
+etc.  On X86, we miss a bunch of 'rotate by variable' cases because the rotate
+matching code in dag combine doesn't look through truncates aggressively 
+enough.  Here are some testcases reduces from GCC PR17886:
+
+unsigned long long f(unsigned long long x, int y) {
+  return (x << y) | (x >> 64-y); 
+} 
+unsigned f2(unsigned x, int y){
+  return (x << y) | (x >> 32-y); 
+} 
+unsigned long long f3(unsigned long long x){
+  int y = 9;
+  return (x << y) | (x >> 64-y); 
+} 
+unsigned f4(unsigned x){
+  int y = 10;
+  return (x << y) | (x >> 32-y); 
+}
+unsigned long long f5(unsigned long long x, unsigned long long y) {
+  return (x << 8) | ((y >> 48) & 0xffull);
+}
+unsigned long long f6(unsigned long long x, unsigned long long y, int z) {
+  switch(z) {
+  case 1:
+    return (x << 8) | ((y >> 48) & 0xffull);
+  case 2:
+    return (x << 16) | ((y >> 40) & 0xffffull);
+  case 3:
+    return (x << 24) | ((y >> 32) & 0xffffffull);
+  case 4:
+    return (x << 32) | ((y >> 24) & 0xffffffffull);
+  default:
+    return (x << 40) | ((y >> 16) & 0xffffffffffull);
+  }
+}
+
+On X86-64, we only handle f3/f4 right.  On x86-32, several of these 
+generate truly horrible code, instead of using shld and friends.  On
+ARM, we end up with calls to L___lshrdi3/L___ashldi3 in f, which is
+badness.  PPC64 misses f, f5 and f6.  CellSPU aborts in isel.
+
+//===---------------------------------------------------------------------===//
+
+We do a number of simplifications in simplify libcalls to strength reduce
+standard library functions, but we don't currently merge them together.  For
+example, it is useful to merge memcpy(a,b,strlen(b)) -> strcpy.  This can only
+be done safely if "b" isn't modified between the strlen and memcpy of course.
+
+//===---------------------------------------------------------------------===//
+
+define i32 @test2(float %X, float %Y) {
+entry:
+        %tmp3 = fcmp uno float %X, %Y           ; <i1> [#uses=1]
+        %tmp34 = zext i1 %tmp3 to i8            ; <i8> [#uses=1]
+        %tmp = xor i8 %tmp34, 1         ; <i8> [#uses=1]
+        %toBoolnot5 = zext i8 %tmp to i32               ; <i32> [#uses=1]
+        ret i32 %toBoolnot5
+}
+
+could be optimized further. Instcombine should use its bitwise analysis to
+collapse the zext/xor/zext structure to an xor/zext and then remove the 
+xor by reversing the fcmp.
+
+Desired output:
+
+define i32 @test2(float %X, float %Y) {
+entry:
+        %tmp3 = fcmp ord float %X, %Y           ; <i1> [#uses=1]
+        %tmp34 = zext i1 %tmp3 to i32           ; <i32> [#uses=1]
+        ret i32 %tmp34
+}
+
+To fix this, we need to make CanEvaluateInDifferentType smarter.
+
+//===---------------------------------------------------------------------===//
+
+We should be able to evaluate this loop:
+
+int test(int x_offs) {
+  while (x_offs > 4)
+     x_offs -= 4;
+  return x_offs;
+}
+
+//===---------------------------------------------------------------------===//