JIT support has been added awhile ago.

[oota-llvm.git] / lib / Target / README.txt
diff --git a/lib/Target/README.txt b/lib/Target/README.txt

index b27d99e18b5048187a62fac75f13633a412e2403..bf8d465d0aeebe9db2b1a2e5375fef188fa8853c 100644 (file)
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -2,78 +2,17 @@ Target Independent Opportunities:
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-We should recognize idioms for add-with-carry and turn it into the appropriate
-intrinsics.  This example:
+We should recognized various "overflow detection" idioms and translate them into
+llvm.uadd.with.overflow and similar intrinsics.  Here is a multiply idiom:
  
  
-unsigned add32carry(unsigned sum, unsigned x) {
- unsigned z = sum + x;
- if (sum + x < x)
-     z++;
- return z;
+unsigned int mul(unsigned int a,unsigned int b) {
+ if ((unsigned long long)a*b>0xffffffff)
+   exit(0);
+  return a*b;
  }
  
  }
  
-Compiles to: clang t.c -S -o - -O3 -fomit-frame-pointer -m64 -mkernel
-
-_add32carry:                            ## @add32carry
-       addl    %esi, %edi
-       cmpl    %esi, %edi
-       sbbl    %eax, %eax
-       andl    $1, %eax
-       addl    %edi, %eax
-       ret
-
-with clang, but to:
-
-_add32carry:
-       leal    (%rsi,%rdi), %eax
-       cmpl    %esi, %eax
-       adcl    $0, %eax
-       ret
-
-with gcc.
-
-//===---------------------------------------------------------------------===//
-
-Dead argument elimination should be enhanced to handle cases when an argument is
-dead to an externally visible function.  Though the argument can't be removed
-from the externally visible function, the caller doesn't need to pass it in.
-For example in this testcase:
-
-  void foo(int X) __attribute__((noinline));
-  void foo(int X) { sideeffect(); }
-  void bar(int A) { foo(A+1); }
-
-We compile bar to:
-
-define void @bar(i32 %A) nounwind ssp {
-  %0 = add nsw i32 %A, 1                          ; <i32> [#uses=1]
-  tail call void @foo(i32 %0) nounwind noinline ssp
-  ret void
-}
-
-The add is dead, we could pass in 'i32 undef' instead.  This occurs for C++
-templates etc, which usually have linkonce_odr/weak_odr linkage, not internal
-linkage.
-
-//===---------------------------------------------------------------------===//
-
-With the recent changes to make the implicit def/use set explicit in
-machineinstrs, we should change the target descriptions for 'call' instructions
-so that the .td files don't list all the call-clobbered registers as implicit
-defs.  Instead, these should be added by the code generator (e.g. on the dag).
-
-This has a number of uses:
-
-1. PPC32/64 and X86 32/64 can avoid having multiple copies of call instructions
-   for their different impdef sets.
-2. Targets with multiple calling convs (e.g. x86) which have different clobber
-   sets don't need copies of call instructions.
-3. 'Interprocedural register allocation' can be done to reduce the clobber sets
-   of calls.
-
-//===---------------------------------------------------------------------===//
-
-Make the PPC branch selector target independant
+The legalization code for mul-with-overflow needs to be made more robust before
+this can be implemented though.
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
@@ -85,41 +24,6 @@ right).
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-Solve this DAG isel folding deficiency:
-
-int X, Y;
-
-void fn1(void)
-{
-  X = X | (Y << 3);
-}
-
-compiles to
-
-fn1:
-       movl Y, %eax
-       shll $3, %eax
-       orl X, %eax
-       movl %eax, X
-       ret
-
-The problem is the store's chain operand is not the load X but rather
-a TokenFactor of the load X and load Y, which prevents the folding.
-
-There are two ways to fix this:
-
-1. The dag combiner can start using alias analysis to realize that y/x
-   don't alias, making the store to X not dependent on the load from Y.
-2. The generated isel could be made smarter in the case it can't
-   disambiguate the pointers.
-
-Number 1 is the preferred solution.
-
-This has been "fixed" by a TableGen hack. But that is a short term workaround
-which will be removed once the proper fix is made.
-
-//===---------------------------------------------------------------------===//
-
  On targets with expensive 64-bit multiply, we could LSR this:
  
  for (i = ...; ++i) {
  On targets with expensive 64-bit multiply, we could LSR this:
  
  for (i = ...; ++i) {
@@ -189,44 +93,6 @@ This requires reassociating to forms of expressions that are already available,
  something that reassoc doesn't think about yet.
  
  
  something that reassoc doesn't think about yet.
  
  
-//===---------------------------------------------------------------------===//
-
-This function: (derived from GCC PR19988)
-double foo(double x, double y) {
-  return ((x + 0.1234 * y) * (x + -0.1234 * y));
-}
-
-compiles to:
-_foo:
-       movapd  %xmm1, %xmm2
-       mulsd   LCPI1_1(%rip), %xmm1
-       mulsd   LCPI1_0(%rip), %xmm2
-       addsd   %xmm0, %xmm1
-       addsd   %xmm0, %xmm2
-       movapd  %xmm1, %xmm0
-       mulsd   %xmm2, %xmm0
-       ret
-
-Reassociate should be able to turn it into:
-
-double foo(double x, double y) {
-  return ((x + 0.1234 * y) * (x - 0.1234 * y));
-}
-
-Which allows the multiply by constant to be CSE'd, producing:
-
-_foo:
-       mulsd   LCPI1_0(%rip), %xmm1
-       movapd  %xmm1, %xmm2
-       addsd   %xmm0, %xmm2
-       subsd   %xmm1, %xmm0
-       mulsd   %xmm2, %xmm0
-       ret
-
-This doesn't need -ffast-math support at all.  This is particularly bad because
-the llvm-gcc frontend is canonicalizing the later into the former, but clang
-doesn't have this problem.
-
  //===---------------------------------------------------------------------===//
  
  These two functions should generate the same code on big-endian systems:
  //===---------------------------------------------------------------------===//
  
  These two functions should generate the same code on big-endian systems:
@@ -248,7 +114,7 @@ stuff too.
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-For vector types, TargetData.cpp::getTypeInfo() returns alignment that is equal
+For vector types, DataLayout.cpp::getTypeInfo() returns alignment that is equal
  to the type size. It works but can be overly conservative as the alignment of
  specific vector types are target dependent.
  
  to the type size. It works but can be overly conservative as the alignment of
  specific vector types are target dependent.
  
@@ -332,12 +198,18 @@ unsigned long reverse(unsigned v) {
      return v ^ (t >> 8);
  }
  
      return v ^ (t >> 8);
  }
  
-Neither is this (very standard idiom):
+//===---------------------------------------------------------------------===//
+
+[LOOP DELETION]
  
  
-int f(int n)
-{
-  return (((n) << 24) | (((n) & 0xff00) << 8) 
-       | (((n) >> 8) & 0xff00) | ((n) >> 24));
+We don't delete this output free loop, because trip count analysis doesn't
+realize that it is finite (if it were infinite, it would be undefined).  Not
+having this blocks Loop Idiom from matching strlen and friends.  
+
+void foo(char *C) {
+  int x = 0;
+  while (*C)
+    ++x,++C;
  }
  
  //===---------------------------------------------------------------------===//
  }
  
  //===---------------------------------------------------------------------===//
@@ -352,22 +224,7 @@ unsigned countbits_slow(unsigned v) {
      c += v & 1;
    return c;
  }
      c += v & 1;
    return c;
  }
-unsigned countbits_fast(unsigned v){
-  unsigned c;
-  for (c = 0; v; c++)
-    v &= v - 1; // clear the least significant bit set
-  return c;
-}
  
  
-BITBOARD = unsigned long long
-int PopCnt(register BITBOARD a) {
-  register int c=0;
-  while(a) {
-    c++;
-    a &= a - 1;
-  }
-  return c;
-}
  unsigned int popcount(unsigned int input) {
    unsigned int count = 0;
    for (unsigned int i =  0; i < 4 * 8; i++)
  unsigned int popcount(unsigned int input) {
    unsigned int count = 0;
    for (unsigned int i =  0; i < 4 * 8; i++)
@@ -375,8 +232,17 @@ unsigned int popcount(unsigned int input) {
    return count;
  }
  
    return count;
  }
  
-This is a form of idiom recognition for loops, the same thing that could be
-useful for recognizing memset/memcpy.
+This should be recognized as CLZ:  rdar://8459039
+
+unsigned clz_a(unsigned a) {
+  int i;
+  for (i=0;i<32;i++)
+    if (a & (1<<(31-i)))
+      return i;
+  return 32;
+}
+
+This sort of thing should be added to the loop idiom pass.
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
@@ -406,14 +272,6 @@ this construct.
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-[LOOP RECOGNITION]
-
-viterbi speeds up *significantly* if the various "history" related copy loops
-are turned into memcpy calls at the source level.  We need a "loops to memcpy"
-pass.
-
-//===---------------------------------------------------------------------===//
-
  [LOOP OPTIMIZATION]
  
  SingleSource/Benchmarks/Misc/dt.c shows several interesting optimization
  [LOOP OPTIMIZATION]
  
  SingleSource/Benchmarks/Misc/dt.c shows several interesting optimization
@@ -465,34 +323,6 @@ PHI Slicing could be extended to do this.
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-LSR should know what GPR types a target has from TargetData.  This code:
-
-volatile short X, Y; // globals
-
-void foo(int N) {
-  int i;
-  for (i = 0; i < N; i++) { X = i; Y = i*4; }
-}
-
-produces two near identical IV's (after promotion) on PPC/ARM:
-
-LBB1_2:
-       ldr r3, LCPI1_0
-       ldr r3, [r3]
-       strh r2, [r3]
-       ldr r3, LCPI1_1
-       ldr r3, [r3]
-       strh r1, [r3]
-       add r1, r1, #4
-       add r2, r2, #1   <- [0,+,1]
-       sub r0, r0, #1   <- [0,-,1]
-       cmp r0, #0
-       bne LBB1_2
-
-LSR should reuse the "+" IV for the exit test.
-
-//===---------------------------------------------------------------------===//
-
  Tail call elim should be more aggressive, checking to see if the call is
  followed by an uncond branch to an exit block.
  
  Tail call elim should be more aggressive, checking to see if the call is
  followed by an uncond branch to an exit block.
  
@@ -639,46 +469,21 @@ struct THotKey { short Key; bool Control; bool Shift; bool Alt; };
  extern THotKey m_HotKey;
  THotKey GetHotKey () { return m_HotKey; }
  
  extern THotKey m_HotKey;
  THotKey GetHotKey () { return m_HotKey; }
  
-into (-O3 -fno-exceptions -static -fomit-frame-pointer):
-
-__Z9GetHotKeyv:
-       pushl   %esi
-       movl    8(%esp), %eax
-       movb    _m_HotKey+3, %cl
-       movb    _m_HotKey+4, %dl
-       movb    _m_HotKey+2, %ch
-       movw    _m_HotKey, %si
-       movw    %si, (%eax)
-       movb    %ch, 2(%eax)
-       movb    %cl, 3(%eax)
-       movb    %dl, 4(%eax)
-       popl    %esi
-       ret     $4
-
-GCC produces:
-
-__Z9GetHotKeyv:
-       movl    _m_HotKey, %edx
-       movl    4(%esp), %eax
-       movl    %edx, (%eax)
-       movzwl  _m_HotKey+4, %edx
-       movw    %dx, 4(%eax)
-       ret     $4
-
-The LLVM IR contains the needed alignment info, so we should be able to 
-merge the loads and stores into 4-byte loads:
-
-       %struct.THotKey = type { i16, i8, i8, i8 }
-define void @_Z9GetHotKeyv(%struct.THotKey* sret  %agg.result) nounwind  {
-...
-       %tmp2 = load i16* getelementptr (@m_HotKey, i32 0, i32 0), align 8
-       %tmp5 = load i8* getelementptr (@m_HotKey, i32 0, i32 1), align 2
-       %tmp8 = load i8* getelementptr (@m_HotKey, i32 0, i32 2), align 1
-       %tmp11 = load i8* getelementptr (@m_HotKey, i32 0, i32 3), align 2
-
-Alternatively, we should use a small amount of base-offset alias analysis
-to make it so the scheduler doesn't need to hold all the loads in regs at
-once.
+into (-m64 -O3 -fno-exceptions -static -fomit-frame-pointer):
+
+__Z9GetHotKeyv:                         ## @_Z9GetHotKeyv
+       movq    _m_HotKey@GOTPCREL(%rip), %rax
+       movzwl  (%rax), %ecx
+       movzbl  2(%rax), %edx
+       shlq    $16, %rdx
+       orq     %rcx, %rdx
+       movzbl  3(%rax), %ecx
+       shlq    $24, %rcx
+       orq     %rdx, %rcx
+       movzbl  4(%rax), %eax
+       shlq    $32, %rax
+       orq     %rcx, %rax
+       ret
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
@@ -690,42 +495,35 @@ implementations of ceil/floor/rint.
  Consider:
  
  int test() {
  Consider:
  
  int test() {
-  long long input[8] = {1,1,1,1,1,1,1,1};
+  long long input[8] = {1,0,1,0,1,0,1,0};
    foo(input);
  }
  
    foo(input);
  }
  
-We currently compile this into a memcpy from a global array since the 
-initializer is fairly large and not memset'able.  This is good, but the memcpy
-gets lowered to load/stores in the code generator.  This is also ok, except
-that the codegen lowering for memcpy doesn't handle the case when the source
-is a constant global.  This gives us atrocious code like this:
+Clang compiles this into:
  
  
-       call    "L1$pb"
-"L1$pb":
-       popl    %eax
-       movl    _C.0.1444-"L1$pb"+32(%eax), %ecx
-       movl    %ecx, 40(%esp)
-       movl    _C.0.1444-"L1$pb"+20(%eax), %ecx
-       movl    %ecx, 28(%esp)
-       movl    _C.0.1444-"L1$pb"+36(%eax), %ecx
-       movl    %ecx, 44(%esp)
-       movl    _C.0.1444-"L1$pb"+44(%eax), %ecx
-       movl    %ecx, 52(%esp)
-       movl    _C.0.1444-"L1$pb"+40(%eax), %ecx
-       movl    %ecx, 48(%esp)
-       movl    _C.0.1444-"L1$pb"+12(%eax), %ecx
-       movl    %ecx, 20(%esp)
-       movl    _C.0.1444-"L1$pb"+4(%eax), %ecx
-...
+  call void @llvm.memset.p0i8.i64(i8* %tmp, i8 0, i64 64, i32 16, i1 false)
+  %0 = getelementptr [8 x i64]* %input, i64 0, i64 0
+  store i64 1, i64* %0, align 16
+  %1 = getelementptr [8 x i64]* %input, i64 0, i64 2
+  store i64 1, i64* %1, align 16
+  %2 = getelementptr [8 x i64]* %input, i64 0, i64 4
+  store i64 1, i64* %2, align 16
+  %3 = getelementptr [8 x i64]* %input, i64 0, i64 6
+  store i64 1, i64* %3, align 16
+
+Which gets codegen'd into:
+
+       pxor    %xmm0, %xmm0
+       movaps  %xmm0, -16(%rbp)
+       movaps  %xmm0, -32(%rbp)
+       movaps  %xmm0, -48(%rbp)
+       movaps  %xmm0, -64(%rbp)
+       movq    $1, -64(%rbp)
+       movq    $1, -48(%rbp)
+       movq    $1, -32(%rbp)
+       movq    $1, -16(%rbp)
  
  
-instead of:
-       movl    $1, 16(%esp)
-       movl    $0, 20(%esp)
-       movl    $1, 24(%esp)
-       movl    $0, 28(%esp)
-       movl    $1, 32(%esp)
-       movl    $0, 36(%esp)
-       ...
+It would be better to have 4 movq's of 0 instead of the movaps's.
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
@@ -771,20 +569,6 @@ etc.  On X86, we miss a bunch of 'rotate by variable' cases because the rotate
  matching code in dag combine doesn't look through truncates aggressively 
  enough.  Here are some testcases reduces from GCC PR17886:
  
  matching code in dag combine doesn't look through truncates aggressively 
  enough.  Here are some testcases reduces from GCC PR17886:
  
-unsigned long long f(unsigned long long x, int y) {
-  return (x << y) | (x >> 64-y); 
-} 
-unsigned f2(unsigned x, int y){
-  return (x << y) | (x >> 32-y); 
-} 
-unsigned long long f3(unsigned long long x){
-  int y = 9;
-  return (x << y) | (x >> 64-y); 
-} 
-unsigned f4(unsigned x){
-  int y = 10;
-  return (x << y) | (x >> 32-y); 
-}
  unsigned long long f5(unsigned long long x, unsigned long long y) {
    return (x << 8) | ((y >> 48) & 0xffull);
  }
  unsigned long long f5(unsigned long long x, unsigned long long y) {
    return (x << 8) | ((y >> 48) & 0xffull);
  }
@@ -803,10 +587,50 @@ unsigned long long f6(unsigned long long x, unsigned long long y, int z) {
    }
  }
  
    }
  }
  
-On X86-64, we only handle f2/f3/f4 right.  On x86-32, a few of these 
-generate truly horrible code, instead of using shld and friends.  On
-ARM, we end up with calls to L___lshrdi3/L___ashldi3 in f, which is
-badness.  PPC64 misses f, f5 and f6.  CellSPU aborts in isel.
+//===---------------------------------------------------------------------===//
+
+This (and similar related idioms):
+
+unsigned int foo(unsigned char i) {
+  return i | (i<<8) | (i<<16) | (i<<24);
+} 
+
+compiles into:
+
+define i32 @foo(i8 zeroext %i) nounwind readnone ssp noredzone {
+entry:
+  %conv = zext i8 %i to i32
+  %shl = shl i32 %conv, 8
+  %shl5 = shl i32 %conv, 16
+  %shl9 = shl i32 %conv, 24
+  %or = or i32 %shl9, %conv
+  %or6 = or i32 %or, %shl5
+  %or10 = or i32 %or6, %shl
+  ret i32 %or10
+}
+
+it would be better as:
+
+unsigned int bar(unsigned char i) {
+  unsigned int j=i | (i << 8); 
+  return j | (j<<16);
+}
+
+aka:
+
+define i32 @bar(i8 zeroext %i) nounwind readnone ssp noredzone {
+entry:
+  %conv = zext i8 %i to i32
+  %shl = shl i32 %conv, 8
+  %or = or i32 %shl, %conv
+  %shl5 = shl i32 %or, 16
+  %or6 = or i32 %shl5, %or
+  ret i32 %or6
+}
+
+or even i*0x01010101, depending on the speed of the multiplier.  The best way to
+handle this is to canonicalize it to a multiply in IR and have codegen handle
+lowering multiplies to shifts on cpus where shifts are faster.
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
@@ -836,18 +660,6 @@ codegen badness or something else (haven't investigated).
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-We miss some instcombines for stuff like this:
-void bar (void);
-void foo (unsigned int a) {
-  /* This one is equivalent to a >= (3 << 2).  */
-  if ((a >> 2) >= 3)
-    bar ();
-}
-
-A few other related ones are in GCC PR14753.
-
-//===---------------------------------------------------------------------===//
-
  Divisibility by constant can be simplified (according to GCC PR12849) from
  being a mulhi to being a mul lo (cheaper).  Testcase:
  
  Divisibility by constant can be simplified (according to GCC PR12849) from
  being a mulhi to being a mul lo (cheaper).  Testcase:
  
@@ -938,16 +750,6 @@ The expression should optimize to something like
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-void a(int variable)
-{
- if (variable == 4 || variable == 6)
-   bar();
-}
-This should optimize to "if ((variable | 2) == 6)".  Currently not
-optimized with "clang -emit-llvm-bc | opt -std-compile-opts | llc".
-
-//===---------------------------------------------------------------------===//
-
  unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return
  i;}
  unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
  unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return
  i;}
  unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
@@ -998,6 +800,7 @@ rshift_gt (unsigned int a)
   if ((a >> 2) > 5)
     bar ();
  }
   if ((a >> 2) > 5)
     bar ();
  }
+
  All should simplify to a single comparison.  All of these are
  currently not optimized with "clang -emit-llvm-bc | opt
  -std-compile-opts".
  All should simplify to a single comparison.  All of these are
  currently not optimized with "clang -emit-llvm-bc | opt
  -std-compile-opts".
@@ -1065,18 +868,6 @@ Should also combine to x | 8.  Currently not optimized with "clang
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-int a(int x) {return (x & 8) == 0 ? -1 : -9;}
-Should combine to (x | -9) ^ 8.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
-
-//===---------------------------------------------------------------------===//
-
-int a(int x) {return (x & 8) == 0 ? -9 : -1;}
-Should combine to x | -9.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
-
-//===---------------------------------------------------------------------===//
-
  int a(int x) {return ((x | -9) ^ 8) & x;}
  Should combine to x & -9.  Currently not optimized with "clang
  -emit-llvm-bc | opt -std-compile-opts".
  int a(int x) {return ((x | -9) ^ 8) & x;}
  Should combine to x & -9.  Currently not optimized with "clang
  -emit-llvm-bc | opt -std-compile-opts".
@@ -1101,6 +892,31 @@ optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
+int g(int x) { return (x - 10) < 0; }
+Should combine to "x <= 9" (the sub has nsw).  Currently not
+optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int g(int x) { return (x + 10) < 0; }
+Should combine to "x < -10" (the add has nsw).  Currently not
+optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int f(int i, int j) { return i < j + 1; }
+int g(int i, int j) { return j > i - 1; }
+Should combine to "i <= j" (the add/sub has nsw).  Currently not
+optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+unsigned f(unsigned x) { return ((x & 7) + 1) & 15; }
+The & 15 part should be optimized away, it doesn't change the result. Currently
+not optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
  This was noticed in the entryblock for grokdeclarator in 403.gcc:
  
          %tmp = icmp eq i32 %decl_context, 4          
  This was noticed in the entryblock for grokdeclarator in 403.gcc:
  
          %tmp = icmp eq i32 %decl_context, 4          
@@ -1177,6 +993,77 @@ int test (int a, int b, int c, int g) {
  It would be better to do the mul once to reduce codesize above the if.
  This is GCC PR38204.
  
  It would be better to do the mul once to reduce codesize above the if.
  This is GCC PR38204.
  
+
+//===---------------------------------------------------------------------===//
+This simple function from 179.art:
+
+int winner, numf2s;
+struct { double y; int   reset; } *Y;
+
+void find_match() {
+   int i;
+   winner = 0;
+   for (i=0;i<numf2s;i++)
+       if (Y[i].y > Y[winner].y)
+              winner =i;
+}
+
+Compiles into (with clang TBAA):
+
+for.body:                                         ; preds = %for.inc, %bb.nph
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.inc ]
+  %i.01718 = phi i32 [ 0, %bb.nph ], [ %i.01719, %for.inc ]
+  %tmp4 = getelementptr inbounds %struct.anon* %tmp3, i64 %indvar, i32 0
+  %tmp5 = load double* %tmp4, align 8, !tbaa !4
+  %idxprom7 = sext i32 %i.01718 to i64
+  %tmp10 = getelementptr inbounds %struct.anon* %tmp3, i64 %idxprom7, i32 0
+  %tmp11 = load double* %tmp10, align 8, !tbaa !4
+  %cmp12 = fcmp ogt double %tmp5, %tmp11
+  br i1 %cmp12, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %i.017 = trunc i64 %indvar to i32
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %i.01719 = phi i32 [ %i.01718, %for.body ], [ %i.017, %if.then ]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %tmp22
+  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+
+It is good that we hoisted the reloads of numf2's, and Y out of the loop and
+sunk the store to winner out.
+
+However, this is awful on several levels: the conditional truncate in the loop
+(-indvars at fault? why can't we completely promote the IV to i64?).
+
+Beyond that, we have a partially redundant load in the loop: if "winner" (aka 
+%i.01718) isn't updated, we reload Y[winner].y the next time through the loop.
+Similarly, the addressing that feeds it (including the sext) is redundant. In
+the end we get this generated assembly:
+
+LBB0_2:                                 ## %for.body
+                                        ## =>This Inner Loop Header: Depth=1
+       movsd   (%rdi), %xmm0
+       movslq  %edx, %r8
+       shlq    $4, %r8
+       ucomisd (%rcx,%r8), %xmm0
+       jbe     LBB0_4
+       movl    %esi, %edx
+LBB0_4:                                 ## %for.inc
+       addq    $16, %rdi
+       incq    %rsi
+       cmpq    %rsi, %rax
+       jne     LBB0_2
+
+All things considered this isn't too bad, but we shouldn't need the movslq or
+the shlq instruction, or the load folded into ucomisd every time through the
+loop.
+
+On an x86-specific topic, if the loop can't be restructure, the movl should be a
+cmov.
+
  //===---------------------------------------------------------------------===//
  
  [STORE SINKING]
  //===---------------------------------------------------------------------===//
  
  [STORE SINKING]
@@ -1238,7 +1125,7 @@ There are many load PRE testcases in testsuite/gcc.dg/tree-ssa/loadpre* in the
  GCC testsuite, ones we don't get yet are (checked through loadpre25):
  
  [CRIT EDGE BREAKING]
  GCC testsuite, ones we don't get yet are (checked through loadpre25):
  
  [CRIT EDGE BREAKING]
-loadpre3.c predcom-4.c
+predcom-4.c
  
  [PRE OF READONLY CALL]
  loadpre5.c
  
  [PRE OF READONLY CALL]
  loadpre5.c
@@ -1248,6 +1135,29 @@ loadpre14.c loadpre15.c
  
  actually a conditional increment: loadpre18.c loadpre19.c
  
  
  actually a conditional increment: loadpre18.c loadpre19.c
  
+//===---------------------------------------------------------------------===//
+
+[LOAD PRE / STORE SINKING / SPEC HACK]
+
+This is a chunk of code from 456.hmmer:
+
+int f(int M, int *mc, int *mpp, int *tpmm, int *ip, int *tpim, int *dpp,
+     int *tpdm, int xmb, int *bp, int *ms) {
+ int k, sc;
+ for (k = 1; k <= M; k++) {
+     mc[k] = mpp[k-1]   + tpmm[k-1];
+     if ((sc = ip[k-1]  + tpim[k-1]) > mc[k])  mc[k] = sc;
+     if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k])  mc[k] = sc;
+     if ((sc = xmb  + bp[k])         > mc[k])  mc[k] = sc;
+     mc[k] += ms[k];
+   }
+}
+
+It is very profitable for this benchmark to turn the conditional stores to mc[k]
+into a conditional move (select instr in IR) and allow the final store to do the
+store.  See GCC PR27313 for more details.  Note that this is valid to xform even
+with the new C++ memory model, since mc[k] is previously loaded and later
+stored.
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
@@ -1293,26 +1203,6 @@ SingleSource/Benchmarks/Misc/dt.c
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-A/B get pinned to the stack because we turn an if/then into a select instead
-of PRE'ing the load/store.  This may be fixable in instcombine:
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37892
-
-struct X { int i; };
-int foo (int x) {
-  struct X a;
-  struct X b;
-  struct X *p;
-  a.i = 1;
-  b.i = 2;
-  if (x)
-    p = &a;
-  else
-    p = &b;
-  return p->i;
-}
-
-//===---------------------------------------------------------------------===//
-
  Interesting missed case because of control flow flattening (should be 2 loads):
  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26629
  With: llvm-gcc t2.c -S -o - -O0 -emit-llvm | llvm-as | 
  Interesting missed case because of control flow flattening (should be 2 loads):
  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26629
  With: llvm-gcc t2.c -S -o - -O0 -emit-llvm | llvm-as | 
@@ -1358,6 +1248,21 @@ codegen.
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
+simplifylibcalls should turn these snprintf idioms into memcpy (GCC PR47917)
+
+char buf1[6], buf2[6], buf3[4], buf4[4];
+int i;
+
+int foo (void) {
+  int ret = snprintf (buf1, sizeof buf1, "abcde");
+  ret += snprintf (buf2, sizeof buf2, "abcdef") * 16;
+  ret += snprintf (buf3, sizeof buf3, "%s", i++ < 6 ? "abc" : "def") * 256;
+  ret += snprintf (buf4, sizeof buf4, "%s", i++ > 10 ? "abcde" : "defgh")*4096;
+  return ret;
+}
+
+//===---------------------------------------------------------------------===//
+
  "gas" uses this idiom:
    else if (strchr ("+-/*%|&^:[]()~", *intel_parser.op_string))
  ..
  "gas" uses this idiom:
    else if (strchr ("+-/*%|&^:[]()~", *intel_parser.op_string))
  ..
@@ -1379,14 +1284,7 @@ Those should be turned into a switch.
          
  This is interesting for a couple reasons.  First, in this:
  
          
  This is interesting for a couple reasons.  First, in this:
  
-        %3073 = call i8* @strcpy(i8* %3072, i8* %3071) nounwind
-        %strlen = call i32 @strlen(i8* %3072)  
-
-The strlen could be replaced with: %strlen = sub %3072, %3073, because the
-strcpy call returns a pointer to the end of the string.  Based on that, the
-endptr GEP just becomes equal to 3073, which eliminates a strlen call and GEP.
-
-Second, the memcpy+strlen strlen can be replaced with:
+The memcpy+strlen strlen can be replaced with:
  
          %3074 = call i32 @strlen([5 x i8]* @"\01LC42") nounwind readonly 
  
  
          %3074 = call i32 @strlen([5 x i8]* @"\01LC42") nounwind readonly 
  
@@ -1462,18 +1360,6 @@ This pattern repeats several times, basically doing:
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-186.crafty also contains this code:
-
-%1906 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
-%1907 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1906
-%1908 = call i8* @strcpy(i8* %1907, i8* %1905) nounwind align 1
-%1909 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
-%1910 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1909         
-
-The last strlen is computable as 1908-@pgn_event, which means 1910=1908.
-
-//===---------------------------------------------------------------------===//
-
  186.crafty has this interesting pattern with the "out.4543" variable:
  
  call void @llvm.memcpy.i32(
  186.crafty has this interesting pattern with the "out.4543" variable:
  
  call void @llvm.memcpy.i32(
@@ -1535,22 +1421,6 @@ the float directly.
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-#include <math.h>
-double foo(double a) {    return sin(a); }
-
-This compiles into this on x86-64 Linux:
-foo:
-       subq    $8, %rsp
-       call    sin
-       addq    $8, %rsp
-       ret
-vs:
-
-foo:
-        jmp sin
-
-//===---------------------------------------------------------------------===//
-
  The arg promotion pass should make use of nocapture to make its alias analysis
  stuff much more precise.
  
  The arg promotion pass should make use of nocapture to make its alias analysis
  stuff much more precise.
  
@@ -1670,21 +1540,6 @@ int bar() { return foo("abcd"); }
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-InstCombine should use SimplifyDemandedBits to remove the or instruction:
-
-define i1 @test(i8 %x, i8 %y) {
-  %A = or i8 %x, 1
-  %B = icmp ugt i8 %A, 3
-  ret i1 %B
-}
-
-Currently instcombine calls SimplifyDemandedBits with either all bits or just
-the sign bit, if the comparison is obviously a sign test. In this case, we only
-need all but the bottom two bits from %A, and if we gave that mask to SDB it
-would delete the or instruction for us.
-
-//===---------------------------------------------------------------------===//
-
  functionattrs doesn't know much about memcpy/memset.  This function should be
  marked readnone rather than readonly, since it only twiddles local memory, but
  functionattrs doesn't handle memset/memcpy/memmove aggressively:
  functionattrs doesn't know much about memcpy/memset.  This function should be
  marked readnone rather than readonly, since it only twiddles local memory, but
  functionattrs doesn't handle memset/memcpy/memmove aggressively:
@@ -1700,6 +1555,10 @@ int foo() {
   return **p;
  }
  
   return **p;
  }
  
+This can be seen at:
+$ clang t.c -S -o - -mkernel -O0 -emit-llvm | opt -functionattrs -S
+
+
  //===---------------------------------------------------------------------===//
  
  Missed instcombine transformation:
  //===---------------------------------------------------------------------===//
  
  Missed instcombine transformation:
@@ -1715,14 +1574,6 @@ This should be optimized to a single compare.  Testcase derived from gcc.
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-Missed instcombine transformation:
-void b();
-void a(int x) { if (((1<<x)&8)==0) b(); }
-
-The shift should be optimized out.  Testcase derived from gcc.
-
-//===---------------------------------------------------------------------===//
-
  Missed instcombine or reassociate transformation:
  int a(int a, int b) { return (a==12)&(b>47)&(b<58); }
  
  Missed instcombine or reassociate transformation:
  int a(int a, int b) { return (a==12)&(b>47)&(b<58); }
  
@@ -1732,28 +1583,35 @@ from gcc.
  //===---------------------------------------------------------------------===//
  
  Missed instcombine transformation:
  //===---------------------------------------------------------------------===//
  
  Missed instcombine transformation:
-define i32 @a(i32 %x) nounwind readnone {
-entry:
-  %rem = srem i32 %x, 32
-  %shl = shl i32 1, %rem
-  ret i32 %shl
-}
  
  
-The srem can be transformed to an and because if x is negative, the shift is
-undefined. Testcase derived from gcc.
+  %382 = srem i32 %tmp14.i, 64                    ; [#uses=1]
+  %383 = zext i32 %382 to i64                     ; [#uses=1]
+  %384 = shl i64 %381, %383                       ; [#uses=1]
+  %385 = icmp slt i32 %tmp14.i, 64                ; [#uses=1]
+
+The srem can be transformed to an and because if %tmp14.i is negative, the
+shift is undefined.  Testcase derived from 403.gcc.
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-Missed instcombine/dagcombine transformation:
-define i32 @a(i32 %x, i32 %y) nounwind readnone {
-entry:
-  %mul = mul i32 %y, -8
-  %sub = sub i32 %x, %mul
-  ret i32 %sub
-}
+This is a range comparison on a divided result (from 403.gcc):
+
+  %1337 = sdiv i32 %1336, 8                       ; [#uses=1]
+  %.off.i208 = add i32 %1336, 7                   ; [#uses=1]
+  %1338 = icmp ult i32 %.off.i208, 15             ; [#uses=1]
+  
+We already catch this (removing the sdiv) if there isn't an add, we should
+handle the 'add' as well.  This is a common idiom with it's builtin_alloca code.
+C testcase:
  
  
-Should compile to something like x+y*8, but currently compiles to an
-inefficient result.  Testcase derived from gcc.
+int a(int x) { return (unsigned)(x/16+7) < 15; }
+
+Another similar case involves truncations on 64-bit targets:
+
+  %361 = sdiv i64 %.046, 8                        ; [#uses=1]
+  %362 = trunc i64 %361 to i32                    ; [#uses=2]
+...
+  %367 = icmp eq i32 %362, 0                      ; [#uses=1]
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
@@ -1860,51 +1718,6 @@ case it choses instead to keep the max operation obvious.
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-Take the following testcase on x86-64 (similar testcases exist for all targets
-with addc/adde):
-
-define void @a(i64* nocapture %s, i64* nocapture %t, i64 %a, i64 %b,
-i64 %c) nounwind {
-entry:
- %0 = zext i64 %a to i128                        ; <i128> [#uses=1]
- %1 = zext i64 %b to i128                        ; <i128> [#uses=1]
- %2 = add i128 %1, %0                            ; <i128> [#uses=2]
- %3 = zext i64 %c to i128                        ; <i128> [#uses=1]
- %4 = shl i128 %3, 64                            ; <i128> [#uses=1]
- %5 = add i128 %4, %2                            ; <i128> [#uses=1]
- %6 = lshr i128 %5, 64                           ; <i128> [#uses=1]
- %7 = trunc i128 %6 to i64                       ; <i64> [#uses=1]
- store i64 %7, i64* %s, align 8
- %8 = trunc i128 %2 to i64                       ; <i64> [#uses=1]
- store i64 %8, i64* %t, align 8
- ret void
-}
-
-Generated code:
-       addq    %rcx, %rdx
-       movl    $0, %eax
-       adcq    $0, %rax
-       addq    %r8, %rax
-       movq    %rax, (%rdi)
-       movq    %rdx, (%rsi)
-       ret
-
-Expected code:
-       addq    %rcx, %rdx
-       adcq    $0, %r8
-       movq    %r8, (%rdi)
-       movq    %rdx, (%rsi)
-       ret
-
-The generated SelectionDAG has an ADD of an ADDE, where both operands of the
-ADDE are zero. Replacing one of the operands of the ADDE with the other operand
-of the ADD, and replacing the ADD with the ADDE, should give the desired result.
-
-(That said, we are doing a lot better than gcc on this testcase. :) )
-
-//===---------------------------------------------------------------------===//
-
-Switch lowering generates less than ideal code for the following switch:
  define void @a(i32 %x) nounwind {
  entry:
    switch i32 %x, label %if.end [
  define void @a(i32 %x) nounwind {
  entry:
    switch i32 %x, label %if.end [
@@ -1925,19 +1738,15 @@ declare void @foo()
  Generated code on x86-64 (other platforms give similar results):
  a:
         cmpl    $5, %edi
  Generated code on x86-64 (other platforms give similar results):
  a:
         cmpl    $5, %edi
-       ja      .LBB0_2
-       movl    %edi, %eax
-       movl    $47, %ecx
-       btq     %rax, %rcx
-       jb      .LBB0_3
+       ja      LBB2_2
+       cmpl    $4, %edi
+       jne     LBB2_3
  .LBB0_2:
         ret
  .LBB0_3:
         jmp     foo  # TAILCALL
  
  .LBB0_2:
         ret
  .LBB0_3:
         jmp     foo  # TAILCALL
  
-The movl+movl+btq+jb could be simplified to a cmpl+jne.
-
-Or, if we wanted to be really clever, we could simplify the whole thing to
+If we wanted to be really clever, we could simplify the whole thing to
  something like the following, which eliminates a branch:
         xorl    $1, %edi
         cmpl    $4, %edi
  something like the following, which eliminates a branch:
         xorl    $1, %edi
         cmpl    $4, %edi
@@ -1945,42 +1754,563 @@ something like the following, which eliminates a branch:
         ret
  .LBB0_2:
         jmp     foo  # TAILCALL
         ret
  .LBB0_2:
         jmp     foo  # TAILCALL
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+int foo(int a) { return (a & (~15)) / 16; }
+
+Into:
+
+define i32 @foo(i32 %a) nounwind readnone ssp {
+entry:
+  %and = and i32 %a, -16
+  %div = sdiv i32 %and, 16
+  ret i32 %div
+}
+
+but this code (X & -A)/A is X >> log2(A) when A is a power of 2, so this case
+should be instcombined into just "a >> 4".
+
+We do get this at the codegen level, so something knows about it, but 
+instcombine should catch it earlier:
+
+_foo:                                   ## @foo
+## BB#0:                                ## %entry
+       movl    %edi, %eax
+       sarl    $4, %eax
+       ret
+
  //===---------------------------------------------------------------------===//
  //===---------------------------------------------------------------------===//
-Given a branch where the two target blocks are identical ("ret i32 %b" in
-both), simplifycfg will simplify them away. But not so for a switch statement:
  
  
-define i32 @f(i32 %a, i32 %b) nounwind readnone {
+This code (from GCC PR28685):
+
+int test(int a, int b) {
+  int lt = a < b;
+  int eq = a == b;
+  if (lt)
+    return 1;
+  return eq;
+}
+
+Is compiled to:
+
+define i32 @test(i32 %a, i32 %b) nounwind readnone ssp {
  entry:
  entry:
-        switch i32 %a, label %bb3 [
-                i32 4, label %bb
-                i32 6, label %bb
-        ]
+  %cmp = icmp slt i32 %a, %b
+  br i1 %cmp, label %return, label %if.end
  
  
-bb:             ; preds = %entry, %entry
-        ret i32 %b
+if.end:                                           ; preds = %entry
+  %cmp5 = icmp eq i32 %a, %b
+  %conv6 = zext i1 %cmp5 to i32
+  ret i32 %conv6
  
  
-bb3:            ; preds = %entry
-        ret i32 %b
+return:                                           ; preds = %entry
+  ret i32 1
  }
  }
+
+it could be:
+
+define i32 @test__(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+  %0 = icmp sle i32 %a, %b
+  %retval = zext i1 %0 to i32
+  ret i32 %retval
+}
+
  //===---------------------------------------------------------------------===//
  
  //===---------------------------------------------------------------------===//
  
-clang -O3 fails to devirtualize this virtual inheritance case: (GCC PR45875)
-Looks related to PR3100
+This code can be seen in viterbi:
  
  
-struct c1 {};
-struct c10 : c1{
-  virtual void foo ();
-};
-struct c11 : c10, c1{
-  virtual void f6 ();
+  %64 = call noalias i8* @malloc(i64 %62) nounwind
+...
+  %67 = call i64 @llvm.objectsize.i64(i8* %64, i1 false) nounwind
+  %68 = call i8* @__memset_chk(i8* %64, i32 0, i64 %62, i64 %67) nounwind
+
+llvm.objectsize.i64 should be taught about malloc/calloc, allowing it to
+fold to %62.  This is a security win (overflows of malloc will get caught)
+and also a performance win by exposing more memsets to the optimizer.
+
+This occurs several times in viterbi.
+
+Note that this would change the semantics of @llvm.objectsize which by its
+current definition always folds to a constant. We also should make sure that
+we remove checking in code like
+
+  char *p = malloc(strlen(s)+1);
+  __strcpy_chk(p, s, __builtin_objectsize(p, 0));
+
+//===---------------------------------------------------------------------===//
+
+This code (from Benchmarks/Dhrystone/dry.c):
+
+define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
+entry:
+  %sext = shl i32 %0, 24
+  %conv = ashr i32 %sext, 24
+  %sext6 = shl i32 %1, 24
+  %conv4 = ashr i32 %sext6, 24
+  %cmp = icmp eq i32 %conv, %conv4
+  %. = select i1 %cmp, i32 10000, i32 0
+  ret i32 %.
+}
+
+Should be simplified into something like:
+
+define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
+entry:
+  %sext = shl i32 %0, 24
+  %conv = and i32 %sext, 0xFF000000
+  %sext6 = shl i32 %1, 24
+  %conv4 = and i32 %sext6, 0xFF000000
+  %cmp = icmp eq i32 %conv, %conv4
+  %. = select i1 %cmp, i32 10000, i32 0
+  ret i32 %.
+}
+
+and then to:
+
+define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
+entry:
+  %conv = and i32 %0, 0xFF
+  %conv4 = and i32 %1, 0xFF
+  %cmp = icmp eq i32 %conv, %conv4
+  %. = select i1 %cmp, i32 10000, i32 0
+  ret i32 %.
+}
+//===---------------------------------------------------------------------===//
+
+clang -O3 currently compiles this code
+
+int g(unsigned int a) {
+  unsigned int c[100];
+  c[10] = a;
+  c[11] = a;
+  unsigned int b = c[10] + c[11];
+  if(b > a*2) a = 4;
+  else a = 8;
+  return a + 7;
+}
+
+into
+
+define i32 @g(i32 a) nounwind readnone {
+  %add = shl i32 %a, 1
+  %mul = shl i32 %a, 1
+  %cmp = icmp ugt i32 %add, %mul
+  %a.addr.0 = select i1 %cmp, i32 11, i32 15
+  ret i32 %a.addr.0
+}
+
+The icmp should fold to false. This CSE opportunity is only available
+after GVN and InstCombine have run.
+
+//===---------------------------------------------------------------------===//
+
+memcpyopt should turn this:
+
+define i8* @test10(i32 %x) {
+  %alloc = call noalias i8* @malloc(i32 %x) nounwind
+  call void @llvm.memset.p0i8.i32(i8* %alloc, i8 0, i32 %x, i32 1, i1 false)
+  ret i8* %alloc
+}
+
+into a call to calloc.  We should make sure that we analyze calloc as
+aggressively as malloc though.
+
+//===---------------------------------------------------------------------===//
+
+clang -O3 doesn't optimize this:
+
+void f1(int* begin, int* end) {
+  std::fill(begin, end, 0);
+}
+
+into a memset.  This is PR8942.
+
+//===---------------------------------------------------------------------===//
+
+clang -O3 -fno-exceptions currently compiles this code:
+
+void f(int N) {
+  std::vector<int> v(N);
+
+  extern void sink(void*); sink(&v);
+}
+
+into
+
+define void @_Z1fi(i32 %N) nounwind {
+entry:
+  %v2 = alloca [3 x i32*], align 8
+  %v2.sub = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 0
+  %tmpcast = bitcast [3 x i32*]* %v2 to %"class.std::vector"*
+  %conv = sext i32 %N to i64
+  store i32* null, i32** %v2.sub, align 8, !tbaa !0
+  %tmp3.i.i.i.i.i = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 1
+  store i32* null, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
+  %tmp4.i.i.i.i.i = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 2
+  store i32* null, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
+  %cmp.i.i.i.i = icmp eq i32 %N, 0
+  br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.thread.i.i, label %cond.true.i.i.i.i
+
+_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.thread.i.i: ; preds = %entry
+  store i32* null, i32** %v2.sub, align 8, !tbaa !0
+  store i32* null, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
+  %add.ptr.i5.i.i = getelementptr inbounds i32* null, i64 %conv
+  store i32* %add.ptr.i5.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
+  br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit
+
+cond.true.i.i.i.i:                                ; preds = %entry
+  %cmp.i.i.i.i.i = icmp slt i32 %N, 0
+  br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i
+
+if.then.i.i.i.i.i:                                ; preds = %cond.true.i.i.i.i
+  call void @_ZSt17__throw_bad_allocv() noreturn nounwind
+  unreachable
+
+_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i:    ; preds = %cond.true.i.i.i.i
+  %mul.i.i.i.i.i = shl i64 %conv, 2
+  %call3.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul.i.i.i.i.i) nounwind
+  %0 = bitcast i8* %call3.i.i.i.i.i to i32*
+  store i32* %0, i32** %v2.sub, align 8, !tbaa !0
+  store i32* %0, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
+  %add.ptr.i.i.i = getelementptr inbounds i32* %0, i64 %conv
+  store i32* %add.ptr.i.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
+  call void @llvm.memset.p0i8.i64(i8* %call3.i.i.i.i.i, i8 0, i64 %mul.i.i.i.i.i, i32 4, i1 false)
+  br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit
+
+This is just the handling the construction of the vector. Most surprising here
+is the fact that all three null stores in %entry are dead (because we do no
+cross-block DSE).
+
+Also surprising is that %conv isn't simplified to 0 in %....exit.thread.i.i.
+This is a because the client of LazyValueInfo doesn't simplify all instruction
+operands, just selected ones.
+
+//===---------------------------------------------------------------------===//
+
+clang -O3 -fno-exceptions currently compiles this code:
+
+void f(char* a, int n) {
+  __builtin_memset(a, 0, n);
+  for (int i = 0; i < n; ++i)
+    a[i] = 0;
+}
+
+into:
+
+define void @_Z1fPci(i8* nocapture %a, i32 %n) nounwind {
+entry:
+  %conv = sext i32 %n to i64
+  tail call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %conv, i32 1, i1 false)
+  %cmp8 = icmp sgt i32 %n, 0
+  br i1 %cmp8, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %tmp10 = add i32 %n, -1
+  %tmp11 = zext i32 %tmp10 to i64
+  %tmp12 = add i64 %tmp11, 1
+  call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %tmp12, i32 1, i1 false)
+  ret void
+
+for.end:                                          ; preds = %entry
+  ret void
+}
+
+This shouldn't need the ((zext (%n - 1)) + 1) game, and it should ideally fold
+the two memset's together.
+
+The issue with the addition only occurs in 64-bit mode, and appears to be at
+least partially caused by Scalar Evolution not keeping its cache updated: it
+returns the "wrong" result immediately after indvars runs, but figures out the
+expected result if it is run from scratch on IR resulting from running indvars.
+
+//===---------------------------------------------------------------------===//
+
+clang -O3 -fno-exceptions currently compiles this code:
+
+struct S {
+  unsigned short m1, m2;
+  unsigned char m3, m4;
  };
  };
-struct c28 : virtual c11{
-  void f6 ();
+
+void f(int N) {
+  std::vector<S> v(N);
+  extern void sink(void*); sink(&v);
+}
+
+into poor code for zero-initializing 'v' when N is >0. The problem is that
+S is only 6 bytes, but each element is 8 byte-aligned. We generate a loop and
+4 stores on each iteration. If the struct were 8 bytes, this gets turned into
+a memset.
+
+In order to handle this we have to:
+  A) Teach clang to generate metadata for memsets of structs that have holes in
+     them.
+  B) Teach clang to use such a memset for zero init of this struct (since it has
+     a hole), instead of doing elementwise zeroing.
+
+//===---------------------------------------------------------------------===//
+
+clang -O3 currently compiles this code:
+
+extern const int magic;
+double f() { return 0.0 * magic; }
+
+into
+
+@magic = external constant i32
+
+define double @_Z1fv() nounwind readnone {
+entry:
+  %tmp = load i32* @magic, align 4, !tbaa !0
+  %conv = sitofp i32 %tmp to double
+  %mul = fmul double %conv, 0.000000e+00
+  ret double %mul
+}
+
+We should be able to fold away this fmul to 0.0.  More generally, fmul(x,0.0)
+can be folded to 0.0 if we can prove that the LHS is not -0.0, not a NaN, and
+not an INF.  The CannotBeNegativeZero predicate in value tracking should be
+extended to support general "fpclassify" operations that can return 
+yes/no/unknown for each of these predicates.
+
+In this predicate, we know that uitofp is trivially never NaN or -0.0, and
+we know that it isn't +/-Inf if the floating point type has enough exponent bits
+to represent the largest integer value as < inf.
+
+//===---------------------------------------------------------------------===//
+
+When optimizing a transformation that can change the sign of 0.0 (such as the
+0.0*val -> 0.0 transformation above), it might be provable that the sign of the
+expression doesn't matter.  For example, by the above rules, we can't transform
+fmul(sitofp(x), 0.0) into 0.0, because x might be -1 and the result of the
+expression is defined to be -0.0.
+
+If we look at the uses of the fmul for example, we might be able to prove that
+all uses don't care about the sign of zero.  For example, if we have:
+
+  fadd(fmul(sitofp(x), 0.0), 2.0)
+
+Since we know that x+2.0 doesn't care about the sign of any zeros in X, we can
+transform the fmul to 0.0, and then the fadd to 2.0.
+
+//===---------------------------------------------------------------------===//
+
+We should enhance memcpy/memcpy/memset to allow a metadata node on them
+indicating that some bytes of the transfer are undefined.  This is useful for
+frontends like clang when lowering struct copies, when some elements of the
+struct are undefined.  Consider something like this:
+
+struct x {
+  char a;
+  int b[4];
  };
  };
-void check_c28 () {
-  c28 obj;
-  c11 *ptr = &obj;
-  ptr->f6 ();
+void foo(struct x*P);
+struct x testfunc() {
+  struct x V1, V2;
+  foo(&V1);
+  V2 = V1;
+
+  return V2;
+}
+
+We currently compile this to:
+$ clang t.c -S -o - -O0 -emit-llvm | opt -scalarrepl -S
+
+
+%struct.x = type { i8, [4 x i32] }
+
+define void @testfunc(%struct.x* sret %agg.result) nounwind ssp {
+entry:
+  %V1 = alloca %struct.x, align 4
+  call void @foo(%struct.x* %V1)
+  %tmp1 = bitcast %struct.x* %V1 to i8*
+  %0 = bitcast %struct.x* %V1 to i160*
+  %srcval1 = load i160* %0, align 4
+  %tmp2 = bitcast %struct.x* %agg.result to i8*
+  %1 = bitcast %struct.x* %agg.result to i160*
+  store i160 %srcval1, i160* %1, align 4
+  ret void
  }
  
  }
  
+This happens because SRoA sees that the temp alloca has is being memcpy'd into
+and out of and it has holes and it has to be conservative.  If we knew about the
+holes, then this could be much much better.
+
+Having information about these holes would also improve memcpy (etc) lowering at
+llc time when it gets inlined, because we can use smaller transfers.  This also
+avoids partial register stalls in some important cases.
+
+//===---------------------------------------------------------------------===//
+
+We don't fold (icmp (add) (add)) unless the two adds only have a single use.
+There are a lot of cases that we're refusing to fold in (e.g.) 256.bzip2, for
+example:
+
+ %indvar.next90 = add i64 %indvar89, 1     ;; Has 2 uses
+ %tmp96 = add i64 %tmp95, 1                ;; Has 1 use
+ %exitcond97 = icmp eq i64 %indvar.next90, %tmp96
+
+We don't fold this because we don't want to introduce an overlapped live range
+of the ivar.  However if we can make this more aggressive without causing
+performance issues in two ways:
+
+1. If *either* the LHS or RHS has a single use, we can definitely do the
+   transformation.  In the overlapping liverange case we're trading one register
+   use for one fewer operation, which is a reasonable trade.  Before doing this
+   we should verify that the llc output actually shrinks for some benchmarks.
+2. If both ops have multiple uses, we can still fold it if the operations are
+   both sinkable to *after* the icmp (e.g. in a subsequent block) which doesn't
+   increase register pressure.
+
+There are a ton of icmp's we aren't simplifying because of the reg pressure
+concern.  Care is warranted here though because many of these are induction
+variables and other cases that matter a lot to performance, like the above.
+Here's a blob of code that you can drop into the bottom of visitICmp to see some
+missed cases:
+
+  { Value *A, *B, *C, *D;
+    if (match(Op0, m_Add(m_Value(A), m_Value(B))) && 
+        match(Op1, m_Add(m_Value(C), m_Value(D))) &&
+        (A == C || A == D || B == C || B == D)) {
+      errs() << "OP0 = " << *Op0 << "  U=" << Op0->getNumUses() << "\n";
+      errs() << "OP1 = " << *Op1 << "  U=" << Op1->getNumUses() << "\n";
+      errs() << "CMP = " << I << "\n\n";
+    }
+  }
+
+//===---------------------------------------------------------------------===//
+
+define i1 @test1(i32 %x) nounwind {
+  %and = and i32 %x, 3
+  %cmp = icmp ult i32 %and, 2
+  ret i1 %cmp
+}
+
+Can be folded to (x & 2) == 0.
+
+define i1 @test2(i32 %x) nounwind {
+  %and = and i32 %x, 3
+  %cmp = icmp ugt i32 %and, 1
+  ret i1 %cmp
+}
+
+Can be folded to (x & 2) != 0.
+
+SimplifyDemandedBits shrinks the "and" constant to 2 but instcombine misses the
+icmp transform.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+typedef struct {
+int f1:1;
+int f2:1;
+int f3:1;
+int f4:29;
+} t1;
+
+typedef struct {
+int f1:1;
+int f2:1;
+int f3:30;
+} t2;
+
+t1 s1;
+t2 s2;
+
+void func1(void)
+{
+s1.f1 = s2.f1;
+s1.f2 = s2.f2;
+}
+
+Compiles into this IR (on x86-64 at least):
+
+%struct.t1 = type { i8, [3 x i8] }
+@s2 = global %struct.t1 zeroinitializer, align 4
+@s1 = global %struct.t1 zeroinitializer, align 4
+define void @func1() nounwind ssp noredzone {
+entry:
+  %0 = load i32* bitcast (%struct.t1* @s2 to i32*), align 4
+  %bf.val.sext5 = and i32 %0, 1
+  %1 = load i32* bitcast (%struct.t1* @s1 to i32*), align 4
+  %2 = and i32 %1, -4
+  %3 = or i32 %2, %bf.val.sext5
+  %bf.val.sext26 = and i32 %0, 2
+  %4 = or i32 %3, %bf.val.sext26
+  store i32 %4, i32* bitcast (%struct.t1* @s1 to i32*), align 4
+  ret void
+}
+
+The two or/and's should be merged into one each.
+
+//===---------------------------------------------------------------------===//
+
+Machine level code hoisting can be useful in some cases.  For example, PR9408
+is about:
+
+typedef union {
+ void (*f1)(int);
+ void (*f2)(long);
+} funcs;
+
+void foo(funcs f, int which) {
+ int a = 5;
+ if (which) {
+   f.f1(a);
+ } else {
+   f.f2(a);
+ }
+}
+
+which we compile to:
+
+foo:                                    # @foo
+# BB#0:                                 # %entry
+       pushq   %rbp
+       movq    %rsp, %rbp
+       testl   %esi, %esi
+       movq    %rdi, %rax
+       je      .LBB0_2
+# BB#1:                                 # %if.then
+       movl    $5, %edi
+       callq   *%rax
+       popq    %rbp
+       ret
+.LBB0_2:                                # %if.else
+       movl    $5, %edi
+       callq   *%rax
+       popq    %rbp
+       ret
+
+Note that bb1 and bb2 are the same.  This doesn't happen at the IR level
+because one call is passing an i32 and the other is passing an i64.
+
+//===---------------------------------------------------------------------===//
+
+I see this sort of pattern in 176.gcc in a few places (e.g. the start of
+store_bit_field).  The rem should be replaced with a multiply and subtract:
+
+  %3 = sdiv i32 %A, %B
+  %4 = srem i32 %A, %B
+
+Similarly for udiv/urem.  Note that this shouldn't be done on X86 or ARM,
+which can do this in a single operation (instruction or libcall).  It is
+probably best to do this in the code generator.
+
+//===---------------------------------------------------------------------===//
+
+unsigned foo(unsigned x, unsigned y) { return (x & y) == 0 || x == 0; }
+should fold to (x & y) == 0.
+
+//===---------------------------------------------------------------------===//
+
+unsigned foo(unsigned x, unsigned y) { return x > y && x != 0; }
+should fold to x > y.
+
  //===---------------------------------------------------------------------===//
  //===---------------------------------------------------------------------===//