Flag -> Glue, the ongoing saga

[oota-llvm.git] / lib / Target / README.txt
diff --git a/lib/Target/README.txt b/lib/Target/README.txt

index 7fa73edaefee2509128c39beb0e591949c7f4c87..c9561660b1c25b1b5433ad6ee9379e0953265814 100644 (file)
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -2,6 +2,38 @@ Target Independent Opportunities:
  
  //===---------------------------------------------------------------------===//
  
+We should recognize idioms for add-with-carry and turn it into the appropriate
+intrinsics.  This example:
+
+unsigned add32carry(unsigned sum, unsigned x) {
+ unsigned z = sum + x;
+ if (sum + x < x)
+     z++;
+ return z;
+}
+
+Compiles to: clang t.c -S -o - -O3 -fomit-frame-pointer -m64 -mkernel
+
+_add32carry:                            ## @add32carry
+       addl    %esi, %edi
+       cmpl    %esi, %edi
+       sbbl    %eax, %eax
+       andl    $1, %eax
+       addl    %edi, %eax
+       ret
+
+with clang, but to:
+
+_add32carry:
+       leal    (%rsi,%rdi), %eax
+       cmpl    %esi, %eax
+       adcl    $0, %eax
+       ret
+
+with gcc.
+
+//===---------------------------------------------------------------------===//
+
  Dead argument elimination should be enhanced to handle cases when an argument is
  dead to an externally visible function.  Though the argument can't be removed
  from the externally visible function, the caller doesn't need to pass it in.
@@ -41,7 +73,14 @@ This has a number of uses:
  
  //===---------------------------------------------------------------------===//
  
-Make the PPC branch selector target independant
+We should recognized various "overflow detection" idioms and translate them into
+llvm.uadd.with.overflow and similar intrinsics.  Here is a multiply idiom:
+
+unsigned int mul(unsigned int a,unsigned int b) {
+ if ((unsigned long long)a*b>0xffffffff)
+   exit(0);
+  return a*b;
+}
  
  //===---------------------------------------------------------------------===//
  
@@ -300,6 +339,14 @@ unsigned long reverse(unsigned v) {
      return v ^ (t >> 8);
  }
  
+Neither is this (very standard idiom):
+
+int f(int n)
+{
+  return (((n) << 24) | (((n) & 0xff00) << 8) 
+       | (((n) >> 8) & 0xff00) | ((n) >> 24));
+}
+
  //===---------------------------------------------------------------------===//
  
  [LOOP RECOGNITION]
@@ -650,7 +697,7 @@ implementations of ceil/floor/rint.
  Consider:
  
  int test() {
-  long long input[8] = {1,1,1,1,1,1,1,1};
+  long long input[8] = {1,0,1,0,1,0,1,0};
    foo(input);
  }
  
@@ -770,6 +817,51 @@ badness.  PPC64 misses f, f5 and f6.  CellSPU aborts in isel.
  
  //===---------------------------------------------------------------------===//
  
+This (and similar related idioms):
+
+unsigned int foo(unsigned char i) {
+  return i | (i<<8) | (i<<16) | (i<<24);
+} 
+
+compiles into:
+
+define i32 @foo(i8 zeroext %i) nounwind readnone ssp noredzone {
+entry:
+  %conv = zext i8 %i to i32
+  %shl = shl i32 %conv, 8
+  %shl5 = shl i32 %conv, 16
+  %shl9 = shl i32 %conv, 24
+  %or = or i32 %shl9, %conv
+  %or6 = or i32 %or, %shl5
+  %or10 = or i32 %or6, %shl
+  ret i32 %or10
+}
+
+it would be better as:
+
+unsigned int bar(unsigned char i) {
+  unsigned int j=i | (i << 8); 
+  return j | (j<<16);
+}
+
+aka:
+
+define i32 @bar(i8 zeroext %i) nounwind readnone ssp noredzone {
+entry:
+  %conv = zext i8 %i to i32
+  %shl = shl i32 %conv, 8
+  %or = or i32 %shl, %conv
+  %shl5 = shl i32 %or, 16
+  %or6 = or i32 %shl5, %or
+  ret i32 %or6
+}
+
+or even i*0x01010101, depending on the speed of the multiplier.  The best way to
+handle this is to canonicalize it to a multiply in IR and have codegen handle
+lowering multiplies to shifts on cpus where shifts are faster.
+
+//===---------------------------------------------------------------------===//
+
  We do a number of simplifications in simplify libcalls to strength reduce
  standard library functions, but we don't currently merge them together.  For
  example, it is useful to merge memcpy(a,b,strlen(b)) -> strcpy.  This can only
@@ -898,27 +990,6 @@ The expression should optimize to something like
  
  //===---------------------------------------------------------------------===//
  
-From GCC Bug 3756:
-int
-pn (int n)
-{
- return (n >= 0 ? 1 : -1);
-}
-Should combine to (n >> 31) | 1.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts | llc".
-
-//===---------------------------------------------------------------------===//
-
-void a(int variable)
-{
- if (variable == 4 || variable == 6)
-   bar();
-}
-This should optimize to "if ((variable | 2) == 6)".  Currently not
-optimized with "clang -emit-llvm-bc | opt -std-compile-opts | llc".
-
-//===---------------------------------------------------------------------===//
-
  unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return
  i;}
  unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
@@ -1036,18 +1107,6 @@ Should also combine to x | 8.  Currently not optimized with "clang
  
  //===---------------------------------------------------------------------===//
  
-int a(int x) {return (x & 8) == 0 ? -1 : -9;}
-Should combine to (x | -9) ^ 8.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
-
-//===---------------------------------------------------------------------===//
-
-int a(int x) {return (x & 8) == 0 ? -9 : -1;}
-Should combine to x | -9.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
-
-//===---------------------------------------------------------------------===//
-
  int a(int x) {return ((x | -9) ^ 8) & x;}
  Should combine to x & -9.  Currently not optimized with "clang
  -emit-llvm-bc | opt -std-compile-opts".
@@ -1219,6 +1278,29 @@ loadpre14.c loadpre15.c
  
  actually a conditional increment: loadpre18.c loadpre19.c
  
+//===---------------------------------------------------------------------===//
+
+[LOAD PRE / STORE SINKING / SPEC HACK]
+
+This is a chunk of code from 456.hmmer:
+
+int f(int M, int *mc, int *mpp, int *tpmm, int *ip, int *tpim, int *dpp,
+     int *tpdm, int xmb, int *bp, int *ms) {
+ int k, sc;
+ for (k = 1; k <= M; k++) {
+     mc[k] = mpp[k-1]   + tpmm[k-1];
+     if ((sc = ip[k-1]  + tpim[k-1]) > mc[k])  mc[k] = sc;
+     if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k])  mc[k] = sc;
+     if ((sc = xmb  + bp[k])         > mc[k])  mc[k] = sc;
+     mc[k] += ms[k];
+   }
+}
+
+It is very profitable for this benchmark to turn the conditional stores to mc[k]
+into a conditional move (select instr in IR) and allow the final store to do the
+store.  See GCC PR27313 for more details.  Note that this is valid to xform even
+with the new C++ memory model, since mc[k] is previously loaded and later
+stored.
  
  //===---------------------------------------------------------------------===//
  
@@ -1311,12 +1393,6 @@ void foo (int a, struct T b)
  
  simplifylibcalls should do several optimizations for strspn/strcspn:
  
-strcspn(x, "") -> strlen(x)
-strcspn("", x) -> 0
-strspn("", x) -> 0
-strspn(x, "") -> strlen(x)
-strspn(x, "a") -> strchr(x, 'a')-x
-
  strcspn(x, "a") -> inlined loop for up to 3 letters (similarly for strspn):
  
  size_t __strcspn_c3 (__const char *__s, int __reject1, int __reject2,
@@ -1356,14 +1432,7 @@ Those should be turned into a switch.
          
  This is interesting for a couple reasons.  First, in this:
  
-        %3073 = call i8* @strcpy(i8* %3072, i8* %3071) nounwind
-        %strlen = call i32 @strlen(i8* %3072)  
-
-The strlen could be replaced with: %strlen = sub %3072, %3073, because the
-strcpy call returns a pointer to the end of the string.  Based on that, the
-endptr GEP just becomes equal to 3073, which eliminates a strlen call and GEP.
-
-Second, the memcpy+strlen strlen can be replaced with:
+The memcpy+strlen strlen can be replaced with:
  
          %3074 = call i32 @strlen([5 x i8]* @"\01LC42") nounwind readonly 
  
@@ -1439,45 +1508,6 @@ This pattern repeats several times, basically doing:
  
  //===---------------------------------------------------------------------===//
  
-186.crafty contains this interesting pattern:
-
-%77 = call i8* @strstr(i8* getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0),
-                       i8* %30)
-%phitmp648 = icmp eq i8* %77, getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0)
-br i1 %phitmp648, label %bb70, label %bb76
-
-bb70:           ; preds = %OptionMatch.exit91, %bb69
-        %78 = call i32 @strlen(i8* %30) nounwind readonly align 1               ; <i32> [#uses=1]
-
-This is basically:
-  cststr = "abcdef";
-  if (strstr(cststr, P) == cststr) {
-     x = strlen(P);
-     ...
-
-The strstr call would be significantly cheaper written as:
-
-cststr = "abcdef";
-if (memcmp(P, str, strlen(P)))
-  x = strlen(P);
-
-This is memcmp+strlen instead of strstr.  This also makes the strlen fully
-redundant.
-
-//===---------------------------------------------------------------------===//
-
-186.crafty also contains this code:
-
-%1906 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
-%1907 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1906
-%1908 = call i8* @strcpy(i8* %1907, i8* %1905) nounwind align 1
-%1909 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
-%1910 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1909         
-
-The last strlen is computable as 1908-@pgn_event, which means 1910=1908.
-
-//===---------------------------------------------------------------------===//
-
  186.crafty has this interesting pattern with the "out.4543" variable:
  
  call void @llvm.memcpy.i32(
@@ -1539,22 +1569,6 @@ the float directly.
  
  //===---------------------------------------------------------------------===//
  
-#include <math.h>
-double foo(double a) {    return sin(a); }
-
-This compiles into this on x86-64 Linux:
-foo:
-       subq    $8, %rsp
-       call    sin
-       addq    $8, %rsp
-       ret
-vs:
-
-foo:
-        jmp sin
-
-//===---------------------------------------------------------------------===//
-
  The arg promotion pass should make use of nocapture to make its alias analysis
  stuff much more precise.
  
@@ -1719,14 +1733,6 @@ This should be optimized to a single compare.  Testcase derived from gcc.
  
  //===---------------------------------------------------------------------===//
  
-Missed instcombine transformation:
-void b();
-void a(int x) { if (((1<<x)&8)==0) b(); }
-
-The shift should be optimized out.  Testcase derived from gcc.
-
-//===---------------------------------------------------------------------===//
-
  Missed instcombine or reassociate transformation:
  int a(int a, int b) { return (a==12)&(b>47)&(b<58); }
  
@@ -1736,28 +1742,35 @@ from gcc.
  //===---------------------------------------------------------------------===//
  
  Missed instcombine transformation:
-define i32 @a(i32 %x) nounwind readnone {
-entry:
-  %rem = srem i32 %x, 32
-  %shl = shl i32 1, %rem
-  ret i32 %shl
-}
  
-The srem can be transformed to an and because if x is negative, the shift is
-undefined. Testcase derived from gcc.
+  %382 = srem i32 %tmp14.i, 64                    ; [#uses=1]
+  %383 = zext i32 %382 to i64                     ; [#uses=1]
+  %384 = shl i64 %381, %383                       ; [#uses=1]
+  %385 = icmp slt i32 %tmp14.i, 64                ; [#uses=1]
+
+The srem can be transformed to an and because if %tmp14.i is negative, the
+shift is undefined.  Testcase derived from 403.gcc.
  
  //===---------------------------------------------------------------------===//
  
-Missed instcombine/dagcombine transformation:
-define i32 @a(i32 %x, i32 %y) nounwind readnone {
-entry:
-  %mul = mul i32 %y, -8
-  %sub = sub i32 %x, %mul
-  ret i32 %sub
-}
+This is a range comparison on a divided result (from 403.gcc):
+
+  %1337 = sdiv i32 %1336, 8                       ; [#uses=1]
+  %.off.i208 = add i32 %1336, 7                   ; [#uses=1]
+  %1338 = icmp ult i32 %.off.i208, 15             ; [#uses=1]
+  
+We already catch this (removing the sdiv) if there isn't an add, we should
+handle the 'add' as well.  This is a common idiom with it's builtin_alloca code.
+C testcase:
  
-Should compile to something like x+y*8, but currently compiles to an
-inefficient result.  Testcase derived from gcc.
+int a(int x) { return (unsigned)(x/16+7) < 15; }
+
+Another similar case involves truncations on 64-bit targets:
+
+  %361 = sdiv i64 %.046, 8                        ; [#uses=1]
+  %362 = trunc i64 %361 to i32                    ; [#uses=2]
+...
+  %367 = icmp eq i32 %362, 0                      ; [#uses=1]
  
  //===---------------------------------------------------------------------===//
  
@@ -1863,3 +1876,192 @@ LLVM prefers comparisons with zero over non-zero in general, but in this
  case it choses instead to keep the max operation obvious.
  
  //===---------------------------------------------------------------------===//
+
+Take the following testcase on x86-64 (similar testcases exist for all targets
+with addc/adde):
+
+define void @a(i64* nocapture %s, i64* nocapture %t, i64 %a, i64 %b,
+i64 %c) nounwind {
+entry:
+ %0 = zext i64 %a to i128                        ; <i128> [#uses=1]
+ %1 = zext i64 %b to i128                        ; <i128> [#uses=1]
+ %2 = add i128 %1, %0                            ; <i128> [#uses=2]
+ %3 = zext i64 %c to i128                        ; <i128> [#uses=1]
+ %4 = shl i128 %3, 64                            ; <i128> [#uses=1]
+ %5 = add i128 %4, %2                            ; <i128> [#uses=1]
+ %6 = lshr i128 %5, 64                           ; <i128> [#uses=1]
+ %7 = trunc i128 %6 to i64                       ; <i64> [#uses=1]
+ store i64 %7, i64* %s, align 8
+ %8 = trunc i128 %2 to i64                       ; <i64> [#uses=1]
+ store i64 %8, i64* %t, align 8
+ ret void
+}
+
+Generated code:
+       addq    %rcx, %rdx
+       movl    $0, %eax
+       adcq    $0, %rax
+       addq    %r8, %rax
+       movq    %rax, (%rdi)
+       movq    %rdx, (%rsi)
+       ret
+
+Expected code:
+       addq    %rcx, %rdx
+       adcq    $0, %r8
+       movq    %r8, (%rdi)
+       movq    %rdx, (%rsi)
+       ret
+
+The generated SelectionDAG has an ADD of an ADDE, where both operands of the
+ADDE are zero. Replacing one of the operands of the ADDE with the other operand
+of the ADD, and replacing the ADD with the ADDE, should give the desired result.
+
+(That said, we are doing a lot better than gcc on this testcase. :) )
+
+//===---------------------------------------------------------------------===//
+
+Switch lowering generates less than ideal code for the following switch:
+define void @a(i32 %x) nounwind {
+entry:
+  switch i32 %x, label %if.end [
+    i32 0, label %if.then
+    i32 1, label %if.then
+    i32 2, label %if.then
+    i32 3, label %if.then
+    i32 5, label %if.then
+  ]
+if.then:
+  tail call void @foo() nounwind
+  ret void
+if.end:
+  ret void
+}
+declare void @foo()
+
+Generated code on x86-64 (other platforms give similar results):
+a:
+       cmpl    $5, %edi
+       ja      .LBB0_2
+       movl    %edi, %eax
+       movl    $47, %ecx
+       btq     %rax, %rcx
+       jb      .LBB0_3
+.LBB0_2:
+       ret
+.LBB0_3:
+       jmp     foo  # TAILCALL
+
+The movl+movl+btq+jb could be simplified to a cmpl+jne.
+
+Or, if we wanted to be really clever, we could simplify the whole thing to
+something like the following, which eliminates a branch:
+       xorl    $1, %edi
+       cmpl    $4, %edi
+       ja      .LBB0_2
+       ret
+.LBB0_2:
+       jmp     foo  # TAILCALL
+//===---------------------------------------------------------------------===//
+Given a branch where the two target blocks are identical ("ret i32 %b" in
+both), simplifycfg will simplify them away. But not so for a switch statement:
+
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+        switch i32 %a, label %bb3 [
+                i32 4, label %bb
+                i32 6, label %bb
+        ]
+
+bb:             ; preds = %entry, %entry
+        ret i32 %b
+
+bb3:            ; preds = %entry
+        ret i32 %b
+}
+//===---------------------------------------------------------------------===//
+
+clang -O3 fails to devirtualize this virtual inheritance case: (GCC PR45875)
+Looks related to PR3100
+
+struct c1 {};
+struct c10 : c1{
+  virtual void foo ();
+};
+struct c11 : c10, c1{
+  virtual void f6 ();
+};
+struct c28 : virtual c11{
+  void f6 ();
+};
+void check_c28 () {
+  c28 obj;
+  c11 *ptr = &obj;
+  ptr->f6 ();
+}
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+int foo(int a) { return (a & (~15)) / 16; }
+
+Into:
+
+define i32 @foo(i32 %a) nounwind readnone ssp {
+entry:
+  %and = and i32 %a, -16
+  %div = sdiv i32 %and, 16
+  ret i32 %div
+}
+
+but this code (X & -A)/A is X >> log2(A) when A is a power of 2, so this case
+should be instcombined into just "a >> 4".
+
+We do get this at the codegen level, so something knows about it, but 
+instcombine should catch it earlier:
+
+_foo:                                   ## @foo
+## BB#0:                                ## %entry
+       movl    %edi, %eax
+       sarl    $4, %eax
+       ret
+
+//===---------------------------------------------------------------------===//
+
+This code (from GCC PR28685):
+
+int test(int a, int b) {
+  int lt = a < b;
+  int eq = a == b;
+  if (lt)
+    return 1;
+  return eq;
+}
+
+Is compiled to:
+
+define i32 @test(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+  %cmp = icmp slt i32 %a, %b
+  br i1 %cmp, label %return, label %if.end
+
+if.end:                                           ; preds = %entry
+  %cmp5 = icmp eq i32 %a, %b
+  %conv6 = zext i1 %cmp5 to i32
+  ret i32 %conv6
+
+return:                                           ; preds = %entry
+  ret i32 1
+}
+
+it could be:
+
+define i32 @test__(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+  %0 = icmp sle i32 %a, %b
+  %retval = zext i1 %0 to i32
+  ret i32 %retval
+}
+
+//===---------------------------------------------------------------------===//