// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//
-Add a MUL2U and MUL2S nodes to represent a multiply that returns both the
-Hi and Lo parts (combination of MUL and MULH[SU] into one node). Add this to
-X86, & make the dag combiner produce it when needed. This will eliminate one
-imul from the code generated for:
-
-long long test(long long X, long long Y) { return X*Y; }
-
-by using the EAX result from the mul. We should add a similar node for
-DIVREM.
-
-another case is:
-
-long long test(int X, int Y) { return (long long)X*Y; }
-
-... which should only be one imul instruction.
-
-//===---------------------------------------------------------------------===//
-
This should be one DIV/IDIV instruction, not a libcall:

unsigned test(unsigned long long X, unsigned Y) {
  return X / Y;
}

Another useful one would be ~0ULL >> X and ~0ULL << X.
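
The two shift cases written out as functions (illustrative only, names invented
here, not part of the original note); both should lower to a short inline
sequence rather than a shift libcall on x86-32:

/* Illustrative functions: */
unsigned long long ones_shr(unsigned X) { return ~0ULL >> X; }
unsigned long long ones_shl(unsigned X) { return ~0ULL << X; }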
-//===---------------------------------------------------------------------===//
-
-Compile this:
-_Bool f(_Bool a) { return a!=1; }
-
-into:
- movzbl %dil, %eax
- xorl $1, %eax
- ret
+One better solution for 1LL << x is:
+ xorl %eax, %eax
+ xorl %edx, %edx
+ testb $32, %cl
+ sete %al
+ setne %dl
+ sall %cl, %eax
+ sall %cl, %edx
+
+But that requires good 8-bit subreg support.
+
+Also, this might be better. It's an extra shift, but it's one instruction
+shorter, and doesn't stress 8-bit subreg support.
+(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
+but without the unnecessary and.)
+ movl %ecx, %eax
+ shrl $5, %eax
+ movl %eax, %edx
+ xorl $1, %edx
+ sall %cl, %eax
+	sall	%cl, %edx
+
+64-bit shifts (in general) expand to really bad code. Instead of using
+cmovs, we should expand to a conditional branch like GCC produces.
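+
+A minimal instance of the general case (my own example; assumes an x86-32
+target where i64 shifts get expanded):
+
+/* Variable 64-bit shift: currently expanded with cmovs; a conditional
+   branch, as GCC emits, would be preferable. */
+unsigned long long shl64(unsigned long long x, unsigned n) { return x << n; }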
//===---------------------------------------------------------------------===//
Some isel ideas:
-1. Dynamic programming based approach when compile time if not an
+1. Dynamic programming based approach when compile time is not an
issue.
2. Code duplication (addressing mode) during isel.
3. Other ideas from "Register-Sensitive Selection, Duplication, and
   Sequencing of Instructions".
Leave any_extend as pseudo instruction and hint to register
allocator. Delay codegen until post register allocation.
+Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
+the coalescer how to deal with it though.
//===---------------------------------------------------------------------===//
-Model X86 EFLAGS as a real register to avoid redudant cmp / test. e.g.
-
- cmpl $1, %eax
- setg %al
- testb %al, %al # unnecessary
- jne .BB7
+It appears icc uses push for parameter passing. Need to investigate.
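+
+A simple call site one could use to compare the two parameter-passing
+strategies (hypothetical functions, my own example; the claim above is the
+note's, not verified here):
+
+int callee(int, int);
+int caller(int a, int b) { return callee(b, a); } /* outgoing args need stores or pushes */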
//===---------------------------------------------------------------------===//
-Count leading zeros and count trailing zeros:
-
-int clz(int X) { return __builtin_clz(X); }
-int ctz(int X) { return __builtin_ctz(X); }
-
-$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel
-clz:
- bsr %eax, DWORD PTR [%esp+4]
- xor %eax, 31
- ret
-ctz:
- bsf %eax, DWORD PTR [%esp+4]
- ret
+This:
-however, check that these are defined for 0 and 32. Our intrinsics are, GCC's
-aren't.
+void foo(void);
+void bar(int x, int *P) {
+ x >>= 2;
+ if (x)
+ foo();
+ *P = x;
+}
-//===---------------------------------------------------------------------===//
+compiles into:
-Use push/pop instructions in prolog/epilog sequences instead of stores off
-ESP (certain code size win, perf win on some [which?] processors).
-Also, it appears icc use push for parameter passing. Need to investigate.
+ movq %rsi, %rbx
+ movl %edi, %r14d
+ sarl $2, %r14d
+ testl %r14d, %r14d
+ je LBB0_2
+
+Instead of doing an explicit test, we can use the flags off the sar. This
+occurs in a bigger testcase like this, which is pretty common:
+
+#include <vector>
+int test1(std::vector<int> &X) {
+ int Sum = 0;
+ for (long i = 0, e = X.size(); i != e; ++i)
+ X[i] = 0;
+ return Sum;
+}
//===---------------------------------------------------------------------===//
should be made smart enough to canonicalize the load into the RHS of a compare
when it can invert the result of the compare for free.
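
A small illustration (my own, not from the note): here the load naturally ends
up on the LHS of the compare, so folding it requires swapping the operands and
inverting the condition.

/* Hypothetical example: the load of *p appears as the left operand of the
   compare; to reuse the load-folding pattern, the operands can be swapped
   if the condition ('<' here) is inverted at the same time. */
int cmp_mem_lhs(int *p, int x) { return *p < x; }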
-How about intrinsics? An example is:
- *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
-
-compiles to
- pmuludq (%eax), %xmm0
- movl 8(%esp), %eax
- movdqa (%eax), %xmm1
- pmulhuw %xmm0, %xmm1
-
-The transformation probably requires a X86 specific pass or a DAG combiner
-target specific hook.
-
-//===---------------------------------------------------------------------===//
-
-When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
-other fast SSE modes.
-
-//===---------------------------------------------------------------------===//
-
-Think about doing i64 math in SSE regs.
-
-//===---------------------------------------------------------------------===//
-
-The DAG Isel doesn't fold the loads into the adds in this testcase. The
-pattern selector does. This is because the chain value of the load gets
-selected first, and the loads aren't checking to see if they are only used by
-and add.
-
-.ll:
-
-int %test(int* %x, int* %y, int* %z) {
- %X = load int* %x
- %Y = load int* %y
- %Z = load int* %z
- %a = add int %X, %Y
- %b = add int %a, %Z
- ret int %b
-}
-
-dag isel:
-
-_test:
- movl 4(%esp), %eax
- movl (%eax), %eax
- movl 8(%esp), %ecx
- movl (%ecx), %ecx
- addl %ecx, %eax
- movl 12(%esp), %ecx
- movl (%ecx), %ecx
- addl %ecx, %eax
- ret
-
-pattern isel:
-
-_test:
- movl 12(%esp), %ecx
- movl 4(%esp), %edx
- movl 8(%esp), %eax
- movl (%eax), %eax
- addl (%edx), %eax
- addl (%ecx), %eax
- ret
-
-This is bad for register pressure, though the dag isel is producing a
-better schedule. :)
-
-//===---------------------------------------------------------------------===//
-
-This testcase should have no SSE instructions in it, and only one load from
-a constant pool:
-
-double %test3(bool %B) {
- %C = select bool %B, double 123.412, double 523.01123123
- ret double %C
-}
-
-Currently, the select is being lowered, which prevents the dag combiner from
-turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
-
-The pattern isel got this one right.
-
-//===---------------------------------------------------------------------===//
-
-SSE doesn't have [mem] op= reg instructions. If we have an SSE instruction
-like this:
-
- X += y
-
-and the register allocator decides to spill X, it is cheaper to emit this as:
-
-Y += [xslot]
-store Y -> [xslot]
-
-than as:
-
-tmp = [xslot]
-tmp += y
-store tmp -> [xslot]
-
-..and this uses one fewer register (so this should be done at load folding
-time, not at spiller time). *Note* however that this can only be done
-if Y is dead. Here's a testcase:
-
-%.str_3 = external global [15 x sbyte] ; <[15 x sbyte]*> [#uses=0]
-implementation ; Functions:
-declare void %printf(int, ...)
-void %main() {
-build_tree.exit:
- br label %no_exit.i7
-no_exit.i7: ; preds = %no_exit.i7, %build_tree.exit
- %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ] ; <double> [#uses=1]
- %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ] ; <double> [#uses=1]
- %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
- %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
- br bool false, label %Compute_Tree.exit23, label %no_exit.i7
-Compute_Tree.exit23: ; preds = %no_exit.i7
- tail call void (int, ...)* %printf( int 0 )
- store double %tmp.34.i18, double* null
- ret void
-}
-
-We currently emit:
-
-.BBmain_1:
- xorpd %XMM1, %XMM1
- addsd %XMM0, %XMM1
-*** movsd %XMM2, QWORD PTR [%ESP + 8]
-*** addsd %XMM2, %XMM1
-*** movsd QWORD PTR [%ESP + 8], %XMM2
- jmp .BBmain_1 # no_exit.i7
-
-This is a bugpoint reduced testcase, which is why the testcase doesn't make
-much sense (e.g. its an infinite loop). :)
-
//===---------------------------------------------------------------------===//
In many cases, LLVM generates code like this:
_test:
movl 8(%esp), %ebx
- xor %eax, %eax
+ xor %eax, %eax
cmpl %ebx, 4(%esp)
setl %al
ret
//===---------------------------------------------------------------------===//
-We should generate 'test' instead of 'cmp' in various cases, e.g.:
-
-bool %test(int %X) {
- %Y = shl int %X, ubyte 1
- %C = seteq int %Y, 0
- ret bool %C
-}
-bool %test(int %X) {
- %Y = and int %X, 8
- %C = seteq int %Y, 0
- ret bool %C
-}
-
-This may just be a matter of using 'test' to write bigger patterns for X86cmp.
-
-An important case is comparison against zero:
-
-if (X == 0) ...
-
-instead of:
-
- cmpl $0, %eax
- je LBB4_2 #cond_next
-
-use:
- test %eax, %eax
- jz LBB4_2
-
-which is smaller.
-
-//===---------------------------------------------------------------------===//
-
-SSE should implement 'select_cc' using 'emulated conditional moves' that use
-pcmp/pand/pandn/por to do a selection instead of a conditional branch:
-
-double %X(double %Y, double %Z, double %A, double %B) {
- %C = setlt double %A, %B
- %z = add double %Z, 0.0 ;; select operand is not a load
- %D = select bool %C, double %Y, double %z
- ret double %D
-}
-
-We currently emit:
-
-_X:
- subl $12, %esp
- xorpd %xmm0, %xmm0
- addsd 24(%esp), %xmm0
- movsd 32(%esp), %xmm1
- movsd 16(%esp), %xmm2
- ucomisd 40(%esp), %xmm1
- jb LBB_X_2
-LBB_X_1:
- movsd %xmm0, %xmm2
-LBB_X_2:
- movsd %xmm2, (%esp)
- fldl (%esp)
- addl $12, %esp
- ret
-
-//===---------------------------------------------------------------------===//
-
We should generate bts/btr/etc instructions on targets where they are cheap or
when codesize is important. e.g., for:
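
The example that originally followed is not shown above; functions along these
lines (my own illustration) are the kind of thing that would map to bts/btr:

void set_bit(int *target, int bit)   { *target |=  (1 << bit); }
void clear_bit(int *target, int bit) { *target &= ~(1 << bit); }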
//===---------------------------------------------------------------------===//
-It's not clear whether we should use pxor or xorps / xorpd to clear XMM
-registers. The choice may depend on subtarget information. We should do some
-more experiments on different x86 machines.
-
-//===---------------------------------------------------------------------===//
-
Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
get this:
-int %test1(int %X) {
- %Y = div int %X, 8
- ret int %Y
+define i32 @test1(i32 %X) {
+ %Y = sdiv i32 %X, 8
+ ret i32 %Y
}
_test1:
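
The current output is truncated above. For reference, the standard branch-free
expansion of X/8 written out in C (my own sketch; it assumes '>>' is an
arithmetic shift on signed ints, as it is on x86):

int sdiv8(int X) {
  int bias = (X >> 31) & 7;   /* 7 if X is negative, 0 otherwise */
  return (X + bias) >> 3;     /* truncates toward zero, matching C division */
}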
//===---------------------------------------------------------------------===//
-Currently the x86 codegen isn't very good at mixing SSE and FPStack
-code:
-
-unsigned int foo(double x) { return x; }
-
-foo:
- subl $20, %esp
- movsd 24(%esp), %xmm0
- movsd %xmm0, 8(%esp)
- fldl 8(%esp)
- fisttpll (%esp)
- movl (%esp), %eax
- addl $20, %esp
- ret
-
-This will be solved when we go to a dynamic programming based isel.
-
-//===---------------------------------------------------------------------===//
-
-Should generate min/max for stuff like:
-
-void minf(float a, float b, float *X) {
- *X = a <= b ? a : b;
-}
-
-Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN
-and ISD::FMAX node types?
-
-//===---------------------------------------------------------------------===//
-
-The first BB of this code:
-
-declare bool %foo()
-int %bar() {
- %V = call bool %foo()
- br bool %V, label %T, label %F
-T:
- ret int 1
-F:
- call bool %foo()
- ret int 12
-}
-
-compiles to:
-
-_bar:
- subl $12, %esp
- call L_foo$stub
- xorb $1, %al
- testb %al, %al
- jne LBB_bar_2 # F
-
-It would be better to emit "cmp %al, 1" than a xor and test.
-
-//===---------------------------------------------------------------------===//
-
-Enable X86InstrInfo::convertToThreeAddress().
-
-//===---------------------------------------------------------------------===//
-
-Investigate whether it is better to codegen the following
-
- %tmp.1 = mul int %x, 9
-to
-
- movl 4(%esp), %eax
- leal (%eax,%eax,8), %eax
-
-as opposed to what llc is currently generating:
-
- imull $9, 4(%esp), %eax
-
-Currently the load folding imull has a higher complexity than the LEA32 pattern.
-
-//===---------------------------------------------------------------------===//
-
We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
We should leave these as libcalls for everything over a much lower threshold,
since libc is hand tuned for medium and large mem ops (avoiding RFO for large
stores, TLB preheating, etc.).
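
For instance (illustrative, not from the note), a fixed multi-megabyte copy
like this is currently emitted inline as rep/movsl but would likely be better
off as a call to memcpy:

#include <string.h>
void big_copy(char *dst, const char *src) {
  memcpy(dst, src, 4 << 20);   /* 4MB */
}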
//===---------------------------------------------------------------------===//
-Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
-feasible.
-
-//===---------------------------------------------------------------------===//
-
-Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
-the reg-reg copy in this example:
-
-float foo(int *x, float *y, unsigned c) {
- float res = 0.0;
- unsigned i;
- for (i = 0; i < c; i++) {
- float xx = (float)x[i];
- xx = xx * y[i];
- xx += res;
- res = xx;
- }
- return res;
-}
-
-LBB_foo_3: # no_exit
- cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
- mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
- addss %XMM0, %XMM1
- inc %ESI
- cmp %ESI, %ECX
-**** movaps %XMM1, %XMM0
- jb LBB_foo_3 # no_exit
-
-//===---------------------------------------------------------------------===//
-
-Codegen:
- if (copysign(1.0, x) == copysign(1.0, y))
-into:
- if (x^y & mask)
-when using SSE.
-
-//===---------------------------------------------------------------------===//
-
Optimize this into something reasonable:
x * copysign(1.0, y) * copysign(1.0, z)
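
Written out as a function (my own sketch, assuming double operands and
<math.h> copysign):

#include <math.h>
double f(double x, double y, double z) {
  /* Only the signs of y and z matter; ideally this reduces to a couple of
     sign-bit xors applied to x rather than two multiplies. */
  return x * copysign(1.0, y) * copysign(1.0, z);
}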
//===---------------------------------------------------------------------===//
-%X = weak global int 0
-
-void %foo(int %N) {
- %N = cast int %N to uint
- %tmp.24 = setgt int %N, 0
- br bool %tmp.24, label %no_exit, label %return
-
-no_exit:
- %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
- %i.0.0 = cast uint %indvar to int
- volatile store int %i.0.0, int* %X
- %indvar.next = add uint %indvar, 1
- %exitcond = seteq uint %indvar.next, %N
- br bool %exitcond, label %return, label %no_exit
-
-return:
- ret void
-}
-
-compiles into:
-
- .text
- .align 4
- .globl _foo
-_foo:
- movl 4(%esp), %eax
- cmpl $1, %eax
- jl LBB_foo_4 # return
-LBB_foo_1: # no_exit.preheader
- xorl %ecx, %ecx
-LBB_foo_2: # no_exit
- movl L_X$non_lazy_ptr, %edx
- movl %ecx, (%edx)
- incl %ecx
- cmpl %eax, %ecx
- jne LBB_foo_2 # no_exit
-LBB_foo_3: # return.loopexit
-LBB_foo_4: # return
- ret
-
-We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
-remateralization is implemented. This can be accomplished with 1) a target
-dependent LICM pass or 2) makeing SelectDAG represent the whole function.
-
-//===---------------------------------------------------------------------===//
-
The following tests perform worse with LSR:
lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
//===---------------------------------------------------------------------===//
-Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
-FR64 to VR128.
-
-//===---------------------------------------------------------------------===//
-
-mov $reg, 48(%esp)
-...
-leal 48(%esp), %eax
-mov %eax, (%esp)
-call _foo
-
-Obviously it would have been better for the first mov (or any op) to store
-directly %esp[0] if there are no other uses.
-
-//===---------------------------------------------------------------------===//
-
-Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
-of a v4sf value.
-
-//===---------------------------------------------------------------------===//
-
-Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
-Perhaps use pxor / xorp* to clear a XMM register first?
-
-//===---------------------------------------------------------------------===//
-
-Better codegen for:
-
-void f(float a, float b, vector float * out) { *out = (vector float){ a, 0.0, 0.0, b}; }
-void f(float a, float b, vector float * out) { *out = (vector float){ a, b, 0.0, 0}; }
-
-For the later we generate:
-
-_f:
- pxor %xmm0, %xmm0
- movss 8(%esp), %xmm1
- movaps %xmm0, %xmm2
- unpcklps %xmm1, %xmm2
- movss 4(%esp), %xmm1
- unpcklps %xmm0, %xmm1
- unpcklps %xmm2, %xmm1
- movl 12(%esp), %eax
- movaps %xmm1, (%eax)
- ret
-
-This seems like it should use shufps, one for each of a & b.
-
-//===---------------------------------------------------------------------===//
-
Adding to the list of cmp / test poor codegen issues:
int test(__m128 *A, __m128 *B) {
//===---------------------------------------------------------------------===//
-How to decide when to use the "floating point version" of logical ops? Here are
-some code fragments:
-
- movaps LCPI5_5, %xmm2
- divps %xmm1, %xmm2
- mulps %xmm2, %xmm3
- mulps 8656(%ecx), %xmm3
- addps 8672(%ecx), %xmm3
- andps LCPI5_6, %xmm2
- andps LCPI5_1, %xmm3
- por %xmm2, %xmm3
- movdqa %xmm3, (%edi)
-
- movaps LCPI5_5, %xmm1
- divps %xmm0, %xmm1
- mulps %xmm1, %xmm3
- mulps 8656(%ecx), %xmm3
- addps 8672(%ecx), %xmm3
- andps LCPI5_6, %xmm1
- andps LCPI5_1, %xmm3
- orps %xmm1, %xmm3
- movaps %xmm3, 112(%esp)
- movaps %xmm3, (%ebx)
+We generate significantly worse code for this than GCC:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
+http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
-Due to some minor source change, the later case ended up using orps and movaps
-instead of por and movdqa. Does it matter?
+There is also one case where we do worse on PPC.
//===---------------------------------------------------------------------===//
-Use movddup to splat a v2f64 directly from a memory source. e.g.
-
-#include <emmintrin.h>
+For this:
-void test(__m128d *r, double A) {
- *r = _mm_set1_pd(A);
+int test(int a)
+{
+ return a * 3;
}
-llc:
-
-_test:
- movsd 8(%esp), %xmm0
- unpcklpd %xmm0, %xmm0
- movl 4(%esp), %eax
- movapd %xmm0, (%eax)
- ret
+We currently emit:
+ imull $3, 4(%esp), %eax
-icc:
+Perhaps this is what we really should generate? Is imull three or four
+cycles? Note: ICC generates this:
+ movl 4(%esp), %eax
+ leal (%eax,%eax,2), %eax
-_test:
- movl 4(%esp), %eax
- movddup 8(%esp), %xmm0
- movapd %xmm0, (%eax)
- ret
+The current instruction priority is based on pattern complexity. The former is
+more "complex" because it folds a load, so the latter will not be emitted.
-//===---------------------------------------------------------------------===//
+Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
+should always try to match LEA first since the LEA matching code does some
+estimate to determine whether the match is profitable.
-X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
-to choose between movaps, movapd, and movdqa based on types of source and
-destination?
+However, if we care more about code size, then imull is better. It's two bytes
+shorter than movl + leal.
-How about andps, andpd, and pand? Do we really care about the type of the packed
-elements? If not, why not always use the "ps" variants which are likely to be
-shorter.
+On a Pentium M, both variants have the same characteristics with regard
+to throughput; however, the multiplication has a latency of four cycles, as
+opposed to two cycles for the movl+lea variant.
//===---------------------------------------------------------------------===//
-We are emitting bad code for this:
+__builtin_ffs codegen is messy.
-float %test(float* %V, int %I, int %D, float %V) {
-entry:
- %tmp = seteq int %D, 0
- br bool %tmp, label %cond_true, label %cond_false23
-
-cond_true:
- %tmp3 = getelementptr float* %V, int %I
- %tmp = load float* %tmp3
- %tmp5 = setgt float %tmp, %V
- %tmp6 = tail call bool %llvm.isunordered.f32( float %tmp, float %V )
- %tmp7 = or bool %tmp5, %tmp6
- br bool %tmp7, label %UnifiedReturnBlock, label %cond_next
-
-cond_next:
- %tmp10 = add int %I, 1
- %tmp12 = getelementptr float* %V, int %tmp10
- %tmp13 = load float* %tmp12
- %tmp15 = setle float %tmp13, %V
- %tmp16 = tail call bool %llvm.isunordered.f32( float %tmp13, float %V )
- %tmp17 = or bool %tmp15, %tmp16
- %retval = select bool %tmp17, float 0.000000e+00, float 1.000000e+00
- ret float %retval
-
-cond_false23:
- %tmp28 = tail call float %foo( float* %V, int %I, int %D, float %V )
- ret float %tmp28
-
-UnifiedReturnBlock: ; preds = %cond_true
- ret float 0.000000e+00
-}
+int ffs_(unsigned X) { return __builtin_ffs(X); }
+
+llvm produces:
+ffs_:
+ movl 4(%esp), %ecx
+ bsfl %ecx, %eax
+ movl $32, %edx
+ cmove %edx, %eax
+ incl %eax
+ xorl %edx, %edx
+ testl %ecx, %ecx
+ cmove %edx, %eax
+ ret
-declare bool %llvm.isunordered.f32(float, float)
+vs gcc:
-declare float %foo(float*, int, int, float)
+_ffs_:
+ movl $-1, %edx
+ bsfl 4(%esp), %eax
+ cmove %edx, %eax
+ addl $1, %eax
+ ret
+Another example of __builtin_ffs (use predsimplify to eliminate a select):
-It exposes a known load folding problem:
+int foo (unsigned long j) {
+ if (j)
+ return __builtin_ffs (j) - 1;
+ else
+ return 0;
+}
- movss (%edx,%ecx,4), %xmm1
- ucomiss %xmm1, %xmm0
+//===---------------------------------------------------------------------===//
-As well as this:
+It appears gcc places string data with linkonce linkage in
+.section __TEXT,__const_coal,coalesced instead of
+.section __DATA,__const_coal,coalesced.
+Take a look at darwin.h; there are other Darwin assembler directives that we
+do not make use of.
-LBB_test_2: # cond_next
- movss LCPI1_0, %xmm2
- pxor %xmm3, %xmm3
- ucomiss %xmm0, %xmm1
- jbe LBB_test_6 # cond_next
-LBB_test_5: # cond_next
- movaps %xmm2, %xmm3
-LBB_test_6: # cond_next
- movss %xmm3, 40(%esp)
- flds 40(%esp)
- addl $44, %esp
- ret
+//===---------------------------------------------------------------------===//
-Clearly it's unnecessary to clear %xmm3. It's also not clear why we are emitting
-three moves (movss, movaps, movss).
-
-//===---------------------------------------------------------------------===//
-
-External test Nurbs exposed some problems. Look for
-__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
-emits:
-
- movaps (%edx), %xmm2 #59.21
- movaps (%edx), %xmm5 #60.21
- movaps (%edx), %xmm4 #61.21
- movaps (%edx), %xmm3 #62.21
- movl 40(%ecx), %ebp #69.49
- shufps $0, %xmm2, %xmm5 #60.21
- movl 100(%esp), %ebx #69.20
- movl (%ebx), %edi #69.20
- imull %ebp, %edi #69.49
- addl (%eax), %edi #70.33
- shufps $85, %xmm2, %xmm4 #61.21
- shufps $170, %xmm2, %xmm3 #62.21
- shufps $255, %xmm2, %xmm2 #63.21
- lea (%ebp,%ebp,2), %ebx #69.49
- negl %ebx #69.49
- lea -3(%edi,%ebx), %ebx #70.33
- shll $4, %ebx #68.37
- addl 32(%ecx), %ebx #68.37
- testb $15, %bl #91.13
- jne L_B1.24 # Prob 5% #91.13
-
-This is the llvm code after instruction scheduling:
-
-cond_next140 (0xa910740, LLVM BB @0xa90beb0):
- %reg1078 = MOV32ri -3
- %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
- %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
- %reg1080 = IMUL32rr %reg1079, %reg1037
- %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
- %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
- %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
- %reg1082 = SHL32ri %reg1038, 4
- %reg1039 = ADD32rr %reg1036, %reg1082
- %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
- %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
- %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
- %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
- %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
- %reg1040 = MOV32rr %reg1039
- %reg1084 = AND32ri8 %reg1039, 15
- CMP32ri8 %reg1084, 0
- JE mbb<cond_next204,0xa914d30>
-
-Still ok. After register allocation:
-
-cond_next140 (0xa910740, LLVM BB @0xa90beb0):
- %EAX = MOV32ri -3
- %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
- ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
- %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
- %EDX = MOV32rm %EDX, 1, %NOREG, 40
- IMUL32rr %EAX<def&use>, %EDX
- %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
- %ESI = MOV32rm %ESI, 1, %NOREG, 0
- MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
- %EAX = LEA32r %ESI, 1, %EAX, -3
- %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
- %ESI = MOV32rm %ESI, 1, %NOREG, 32
- %EDI = MOV32rr %EAX
- SHL32ri %EDI<def&use>, 4
- ADD32rr %EDI<def&use>, %ESI
- %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
- %XMM1 = MOVAPSrr %XMM0
- SHUFPSrr %XMM1<def&use>, %XMM1, 170
- %XMM2 = MOVAPSrr %XMM0
- SHUFPSrr %XMM2<def&use>, %XMM2, 0
- %XMM3 = MOVAPSrr %XMM0
- SHUFPSrr %XMM3<def&use>, %XMM3, 255
- SHUFPSrr %XMM0<def&use>, %XMM0, 85
- %EBX = MOV32rr %EDI
- AND32ri8 %EBX<def&use>, 15
- CMP32ri8 %EBX, 0
- JE mbb<cond_next204,0xa914d30>
-
-This looks really bad. The problem is shufps is a destructive opcode. Since it
-appears as operand two in more than one shufps ops. It resulted in a number of
-copies. Note icc also suffers from the same problem. Either the instruction
-selector should select pshufd or The register allocator can made the two-address
-to three-address transformation.
-
-It also exposes some other problems. See MOV32ri -3 and the spills.
-
-//===---------------------------------------------------------------------===//
-
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
-
-LLVM is producing bad code.
-
-LBB_main_4: # cond_true44
- addps %xmm1, %xmm2
- subps %xmm3, %xmm2
- movaps (%ecx), %xmm4
- movaps %xmm2, %xmm1
- addps %xmm4, %xmm1
- addl $16, %ecx
- incl %edx
- cmpl $262144, %edx
- movaps %xmm3, %xmm2
- movaps %xmm4, %xmm3
- jne LBB_main_4 # cond_true44
-
-There are two problems. 1) No need to two loop induction variables. We can
-compare against 262144 * 16. 2) Known register coalescer issue. We should
-be able eliminate one of the movaps:
-
- addps %xmm2, %xmm1 <=== Commute!
- subps %xmm3, %xmm1
- movaps (%ecx), %xmm4
- movaps %xmm1, %xmm1 <=== Eliminate!
- addps %xmm4, %xmm1
- addl $16, %ecx
- incl %edx
- cmpl $262144, %edx
- movaps %xmm3, %xmm2
- movaps %xmm4, %xmm3
- jne LBB_main_4 # cond_true44
-
-//===---------------------------------------------------------------------===//
-
-Consider:
-
-__m128 test(float a) {
- return _mm_set_ps(0.0, 0.0, 0.0, a*a);
+define i32 @foo(i32* %a, i32 %t) {
+entry:
+ br label %cond_true
+
+cond_true: ; preds = %cond_true, %entry
+ %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; <i32> [#uses=3]
+ %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; <i32> [#uses=1]
+ %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; <i32*> [#uses=1]
+ %tmp3 = load i32* %tmp2 ; <i32> [#uses=1]
+ %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; <i32> [#uses=1]
+ %tmp7 = add i32 %tmp5, %tmp3 ; <i32> [#uses=2]
+ %tmp9 = add i32 %x.0.0, 1 ; <i32> [#uses=2]
+ %tmp = icmp sgt i32 %tmp9, 39 ; <i1> [#uses=1]
+ br i1 %tmp, label %bb12, label %cond_true
+
+bb12: ; preds = %cond_true
+ ret i32 %tmp7
}
+is pessimized by -loop-reduce and -indvars
-This compiles into:
+//===---------------------------------------------------------------------===//
-movss 4(%esp), %xmm1
-mulss %xmm1, %xmm1
-xorps %xmm0, %xmm0
-movss %xmm1, %xmm0
-ret
+u32 to float conversion improvement:
-Because mulss doesn't modify the top 3 elements, the top elements of
-xmm1 are already zero'd. We could compile this to:
+float uint32_2_float( unsigned u ) {
+ float fl = (int) (u & 0xffff);
+ float fh = (int) (u >> 16);
+ fh *= 0x1.0p16f;
+ return fh + fl;
+}
-movss 4(%esp), %xmm0
-mulss %xmm0, %xmm0
-ret
+00000000 subl $0x04,%esp
+00000003 movl 0x08(%esp,1),%eax
+00000007 movl %eax,%ecx
+00000009 shrl $0x10,%ecx
+0000000c cvtsi2ss %ecx,%xmm0
+00000010 andl $0x0000ffff,%eax
+00000015 cvtsi2ss %eax,%xmm1
+00000019 mulss 0x00000078,%xmm0
+00000021 addss %xmm1,%xmm0
+00000025 movss %xmm0,(%esp,1)
+0000002a flds (%esp,1)
+0000002d addl $0x04,%esp
+00000030 ret
//===---------------------------------------------------------------------===//
-Here's a sick and twisted idea. Consider code like this:
-
-__m128 test(__m128 a) {
- float b = *(float*)&A;
- ...
- return _mm_set_ps(0.0, 0.0, 0.0, b);
-}
+When using the fastcc ABI, align the stack slot of an argument of type double
+on an 8-byte boundary to improve performance.
-This might compile to this code:
+//===---------------------------------------------------------------------===//
-movaps c(%esp), %xmm1
-xorps %xmm0, %xmm0
-movss %xmm1, %xmm0
-ret
+GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
+simplifications for integer "x cmp y ? a : b".
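+
+Typical input shapes those simplifications target (my own examples):
+
+int sel_cmp(int x, int y, int a, int b) { return x < y ? a : b; }
+int sel_zero(int x)                     { return x != 0 ? 3 : 5; }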
-Now consider if the ... code caused xmm1 to get spilled. This might produce
-this code:
+//===---------------------------------------------------------------------===//
-movaps c(%esp), %xmm1
-movaps %xmm1, c2(%esp)
-...
+Consider the expansion of:
-xorps %xmm0, %xmm0
-movaps c2(%esp), %xmm1
-movss %xmm1, %xmm0
-ret
+define i32 @test3(i32 %X) {
+ %tmp1 = urem i32 %X, 255
+ ret i32 %tmp1
+}
-However, since the reload is only used by these instructions, we could
-"fold" it into the uses, producing something like this:
+Currently it compiles to:
-movaps c(%esp), %xmm1
-movaps %xmm1, c2(%esp)
+...
+ movl $2155905153, %ecx
+ movl 8(%esp), %esi
+ movl %esi, %eax
+ mull %ecx
...
-movss c2(%esp), %xmm0
-ret
+This could be "reassociated" into:
-... saving two instructions.
+ movl $2155905153, %eax
+ movl 8(%esp), %ecx
+ mull %ecx
-The basic idea is that a reload from a spill slot, can, if only one 4-byte
-chunk is used, bring in 3 zeros the the one element instead of 4 elements.
-This can be used to simplify a variety of shuffle operations, where the
-elements are fixed zeros.
+to avoid the copy. In fact, the existing two-address stuff would do this
+except that mul isn't a commutative 2-addr instruction. I guess this has
+to be done at isel time based on the #uses to mul?
//===---------------------------------------------------------------------===//
-We generate significantly worse code for this than GCC:
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
-http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
+Make sure the instruction which starts a loop does not cross a cacheline
+boundary. This requires knowing the exact length of each machine instruction.
+That is somewhat complicated, but doable. Example 256.bzip2:
-There is also one case we do worse on PPC.
+In the new trace, the hot loop has an instruction which crosses a cacheline
+boundary. In addition to potential cache misses, this can't help decoding as I
+imagine there has to be some kind of complicated decoder reset and realignment
+to grab the bytes from the next cacheline.
+
+532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines
+942 942 0x3d03 movl %dh, (1809(%esp, %esi)
+937 937 0x3d0a incl %esi
+3 3 0x3d0b cmpb %bl, %dl
+27 27 0x3d0d jnz 0x000062db <main+11707>
//===---------------------------------------------------------------------===//
-For this:
+In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
-#include <emmintrin.h>
-void test(__m128d *r, __m128d *A, double B) {
- *r = _mm_loadl_pd(*A, &B);
-}
+//===---------------------------------------------------------------------===//
-We generates:
-
- subl $12, %esp
- movsd 24(%esp), %xmm0
- movsd %xmm0, (%esp)
- movl 20(%esp), %eax
- movapd (%eax), %xmm0
- movlpd (%esp), %xmm0
- movl 16(%esp), %eax
- movapd %xmm0, (%eax)
- addl $12, %esp
- ret
+This could be a single 16-bit load.
-icc generates:
+int f(char *p) {
+ if ((p[0] == 1) & (p[1] == 2)) return 1;
+ return 0;
+}
- movl 4(%esp), %edx #3.6
- movl 8(%esp), %eax #3.6
- movapd (%eax), %xmm0 #4.22
- movlpd 12(%esp), %xmm0 #4.8
- movapd %xmm0, (%edx) #4.3
- ret #5.1
+//===---------------------------------------------------------------------===//
-So icc is smart enough to know that B is in memory so it doesn't load it and
-store it back to stack.
+We should inline lrintf and probably other libc functions.
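+
+For example (my own sketch), this should become an inline cvtss2si-style
+conversion rather than a call into libc:
+
+#include <math.h>
+long round_to_long(float x) { return lrintf(x); }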
//===---------------------------------------------------------------------===//
-__m128d test1( __m128d A, __m128d B) {
- return _mm_shuffle_pd(A, B, 0x3);
+Use the FLAGS values from arithmetic instructions more. For example, compile:
+
+int add_zf(int *x, int y, int a, int b) {
+ if ((*x += y) == 0)
+ return a;
+ else
+ return b;
}
-compiles to
+to:
+ addl %esi, (%rdi)
+ movl %edx, %eax
+ cmovne %ecx, %eax
+ ret
+instead of:
+
+_add_zf:
+ addl (%rdi), %esi
+ movl %esi, (%rdi)
+ testl %esi, %esi
+ cmove %edx, %ecx
+ movl %ecx, %eax
+ ret
+
+As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
+without a test instruction.
-shufpd $3, %xmm1, %xmm0
+//===---------------------------------------------------------------------===//
-Perhaps it's better to use unpckhpd instead?
+These two functions have identical effects:
-unpckhpd %xmm1, %xmm0
+unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
+unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
-Don't know if unpckhpd is faster. But it is shorter.
+We currently compile them to:
-//===---------------------------------------------------------------------===//
+_f:
+ movl 4(%esp), %eax
+ movl %eax, %ecx
+ incl %ecx
+ movl 8(%esp), %edx
+ cmpl %edx, %ecx
+ jne LBB1_2 #UnifiedReturnBlock
+LBB1_1: #cond_true
+ addl $2, %eax
+ ret
+LBB1_2: #UnifiedReturnBlock
+ movl %ecx, %eax
+ ret
+_f2:
+ movl 4(%esp), %eax
+ movl %eax, %ecx
+ incl %ecx
+ cmpl 8(%esp), %ecx
+ sete %cl
+ movzbl %cl, %ecx
+ leal 1(%ecx,%eax), %eax
+ ret
-If shorter, we should use things like:
-movzwl %ax, %eax
-instead of:
-andl $65535, %EAX
+both of which are inferior to GCC's:
-The former can also be used when the two-addressy nature of the 'and' would
-require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
+_f:
+ movl 4(%esp), %edx
+ leal 1(%edx), %eax
+ addl $2, %edx
+ cmpl 8(%esp), %eax
+ cmove %edx, %eax
+ ret
+_f2:
+ movl 4(%esp), %eax
+ addl $1, %eax
+ xorl %edx, %edx
+ cmpl 8(%esp), %eax
+ sete %dl
+ addl %edx, %eax
+ ret
//===---------------------------------------------------------------------===//
-This code generates ugly code, probably due to costs being off or something:
+This code:
-void %test(float* %P, <4 x float>* %P2 ) {
- %xFloat0.688 = load float* %P
- %loadVector37.712 = load <4 x float>* %P2
- %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
- store <4 x float> %inFloat3.713, <4 x float>* %P2
- ret void
+void test(int X) {
+ if (X) abort();
}
-Generates:
+is currently compiled to:
_test:
- pxor %xmm0, %xmm0
- movd %xmm0, %eax ;; EAX = 0!
- movl 8(%esp), %ecx
- movaps (%ecx), %xmm0
- pinsrw $6, %eax, %xmm0
- shrl $16, %eax ;; EAX = 0 again!
- pinsrw $7, %eax, %xmm0
- movaps %xmm0, (%ecx)
+ subl $12, %esp
+ cmpl $0, 16(%esp)
+ jne LBB1_1
+ addl $12, %esp
ret
+LBB1_1:
+ call L_abort$stub
-It would be better to generate:
+It would be better to produce:
_test:
- movl 8(%esp), %ecx
- movaps (%ecx), %xmm0
- xor %eax, %eax
- pinsrw $6, %eax, %xmm0
- pinsrw $7, %eax, %xmm0
- movaps %xmm0, (%ecx)
+ subl $12, %esp
+ cmpl $0, 16(%esp)
+ jne L_abort$stub
+ addl $12, %esp
+ ret
+
+This can be applied to any no-return function call that takes no arguments etc.
+Alternatively, the stack save/restore logic could be shrink-wrapped, producing
+something like this:
+
+_test:
+ cmpl $0, 4(%esp)
+ jne LBB1_1
ret
+LBB1_1:
+ subl $12, %esp
+ call L_abort$stub
-or use pxor (to make a zero vector) and shuffle (to insert it).
+Both are useful in different situations. Finally, it could be shrink-wrapped
+and tail called, like this:
+
+_test:
+ cmpl $0, 4(%esp)
+ jne LBB1_1
+ ret
+LBB1_1:
+ pop %eax # realign stack.
+ call L_abort$stub
+
+Though this probably isn't worth it.
//===---------------------------------------------------------------------===//
-Bad codegen:
+Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
+a neg instead of a sub instruction. Consider:
+
+int test(char X) { return 7-X; }
+
+we currently produce:
+_test:
+ movl $7, %eax
+ movsbl 4(%esp), %ecx
+ subl %ecx, %eax
+ ret
+
+We would use one fewer register if codegen'd as:
+
+ movsbl 4(%esp), %eax
+ neg %eax
+ add $7, %eax
+ ret
+
+Note that this isn't beneficial if the load can be folded into the sub. In
+this case, we want a sub:
+
+int test(int X) { return 7-X; }
+_test:
+ movl $7, %eax
+ subl 4(%esp), %eax
+ ret
+
+//===---------------------------------------------------------------------===//
-char foo(int x) { return x; }
+Leaf functions that require one 4-byte spill slot have a prolog like this:
_foo:
- movl 4(%esp), %eax
- shll $24, %eax
- sarl $24, %eax
+ pushl %esi
+ subl $4, %esp
+...
+and an epilog like this:
+ addl $4, %esp
+ popl %esi
+ ret
+
+It would be smaller, and potentially faster, to push eax on entry and to
+pop into a dummy register instead of using addl/subl of esp. Just don't pop
+into any return registers :)
+
+//===---------------------------------------------------------------------===//
+
+The X86 backend should fold (branch (or (setcc, setcc))) into multiple
+branches. We generate really poor code for:
+
+double testf(double a) {
+ return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
+}
+
+For example, the entry BB is:
+
+_testf:
+ subl $20, %esp
+ pxor %xmm0, %xmm0
+ movsd 24(%esp), %xmm1
+ ucomisd %xmm0, %xmm1
+ setnp %al
+ sete %cl
+ testb %cl, %al
+ jne LBB1_5 # UnifiedReturnBlock
+LBB1_1: # cond_true
+
+
+it would be better to replace the last four instructions with:
+
+ jp LBB1_1
+ je LBB1_5
+LBB1_1:
+
+We also codegen the inner ?: into a diamond:
+
+ cvtss2sd LCPI1_0(%rip), %xmm2
+ cvtss2sd LCPI1_1(%rip), %xmm3
+ ucomisd %xmm1, %xmm0
+ ja LBB1_3 # cond_true
+LBB1_2: # cond_true
+ movapd %xmm3, %xmm2
+LBB1_3: # cond_true
+ movapd %xmm2, %xmm0
+ ret
+
+We should sink the load into xmm3 into the LBB1_2 block. This should
+be pretty easy, and will nuke all the copies.
+
+//===---------------------------------------------------------------------===//
+
+This:
+ #include <algorithm>
+ inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
+ { return std::make_pair(a + b, a + b < a); }
+ bool no_overflow(unsigned a, unsigned b)
+ { return !full_add(a, b).second; }
+
+Should compile to:
+ addl %esi, %edi
+ setae %al
+ movzbl %al, %eax
+ ret
+
+on x86-64, instead of the rather stupid-looking:
+ addl %esi, %edi
+ setb %al
+ xorb $1, %al
+ movzbl %al, %eax
+ ret
+
+
+//===---------------------------------------------------------------------===//
+
+The following code:
+
+bb114.preheader: ; preds = %cond_next94
+ %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1]
+ %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1]
+ %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1]
+ %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1]
+ %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1]
+ %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2]
+ %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1]
+ %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1]
+ %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1]
+ %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1]
+ %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1]
+ br label %bb114
+
+produces:
+
+LBB3_5: # bb114.preheader
+ movswl -68(%ebp), %eax
+ movl $32, %ecx
+ movl %ecx, -80(%ebp)
+ subl %eax, -80(%ebp)
+ movswl -52(%ebp), %eax
+ movl %ecx, -84(%ebp)
+ subl %eax, -84(%ebp)
+ movswl -70(%ebp), %eax
+ movl %ecx, -88(%ebp)
+ subl %eax, -88(%ebp)
+ movswl -50(%ebp), %eax
+ subl %eax, %ecx
+ movl %ecx, -76(%ebp)
+ movswl -42(%ebp), %eax
+ movl %eax, -92(%ebp)
+ movswl -66(%ebp), %eax
+ movl %eax, -96(%ebp)
+ movw $0, -98(%ebp)
+
+This appears to be bad because the RA is not folding the store to the stack
+slot into the movl. The above instructions could be:
+ movl $32, -80(%ebp)
+...
+ movl $32, -84(%ebp)
+...
+This seems like a cross between remat and spill folding.
+
+This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
+change, so we could simply subtract %eax from %ecx first and then use %ecx (or
+vice-versa).
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+ %tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1]
+ br i1 %tmp659, label %cond_true662, label %cond_next715
+
+produces this:
+
+ testw %cx, %cx
+ movswl %cx, %esi
+ jns LBB4_109 # cond_next715
+
+Shark tells us that using %cx in the testw instruction is sub-optimal. It
+suggests using the 32-bit register (which is what ICC uses).
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+void compare (long long foo) {
+ if (foo < 4294967297LL)
+ abort();
+}
+
+to:
+
+compare:
+ subl $4, %esp
+ cmpl $0, 8(%esp)
+ setne %al
+ movzbw %al, %ax
+ cmpl $1, 12(%esp)
+ setg %cl
+ movzbw %cl, %cx
+ cmove %ax, %cx
+ testb $1, %cl
+ jne .LBB1_2 # UnifiedReturnBlock
+.LBB1_1: # ifthen
+ call abort
+.LBB1_2: # UnifiedReturnBlock
+ addl $4, %esp
+ ret
+
+(also really horrible code on ppc). This is due to the expand code for 64-bit
+compares. GCC produces multiple branches, which is much nicer:
+
+compare:
+ subl $12, %esp
+ movl 20(%esp), %edx
+ movl 16(%esp), %eax
+ decl %edx
+ jle .L7
+.L5:
+ addl $12, %esp
+ ret
+ .p2align 4,,7
+.L7:
+ jl .L4
+ cmpl $0, %eax
+ .p2align 4,,8
+ ja .L5
+.L4:
+ .p2align 4,,9
+ call abort
+
+//===---------------------------------------------------------------------===//
+
+Tail call optimization improvements: Tail call optimization currently
+pushes onto the top of the stack (their normal place for non-tail-call
+optimized calls) all arguments that source from the caller's arguments
+or from a virtual register (which may itself source from the caller's
+arguments).
+This is done to prevent overwriting of parameters (see the example
+below) that might be used later.
+
+example:
+
+int callee(int32, int64);
+int caller(int32 arg1, int32 arg2) {
+ int64 local = arg2 * 2;
+ return callee(arg2, (int64)local);
+}
+
+[arg1] [!arg2 no longer valid since we moved local onto it]
+[arg2] -> [(int64)
+[RETADDR] local ]
+
+Moving arg1 onto the stack slot of callee function would overwrite
+arg2 of the caller.
+
+Possible optimizations:
+
+
+ - Analyse the actual parameters of the callee to see which would
+ overwrite a caller parameter which is used by the callee and only
+ push them onto the top of the stack.
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg1,arg2);
+ }
+
+ Here we don't need to write any variables to the top of the stack
+ since they don't overwrite each other.
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg2,arg1);
+ }
+
+ Here we need to push the arguments because they overwrite each
+ other.
+
+//===---------------------------------------------------------------------===//
+
+main ()
+{
+ int i = 0;
+ unsigned long int z = 0;
+
+ do {
+ z -= 0x00004000;
+ i++;
+ if (i > 0x00040000)
+ abort ();
+ } while (z > 0);
+ exit (0);
+}
+
+gcc compiles this to:
+
+_main:
+ subl $28, %esp
+ xorl %eax, %eax
+ jmp L2
+L3:
+ cmpl $262144, %eax
+ je L10
+L2:
+ addl $1, %eax
+ cmpl $262145, %eax
+ jne L3
+ call L_abort$stub
+L10:
+ movl $0, (%esp)
+ call L_exit$stub
+
+llvm:
+
+_main:
+ subl $12, %esp
+ movl $1, %eax
+ movl $16384, %ecx
+LBB1_1: # bb
+ cmpl $262145, %eax
+ jge LBB1_4 # cond_true
+LBB1_2: # cond_next
+ incl %eax
+ addl $4294950912, %ecx
+ cmpl $16384, %ecx
+ jne LBB1_1 # bb
+LBB1_3: # bb11
+ xorl %eax, %eax
+ addl $12, %esp
+ ret
+LBB1_4: # cond_true
+ call L_abort$stub
+
+1. LSR should rewrite the first cmp with induction variable %ecx.
+2. DAG combiner should fold
+ leal 1(%eax), %edx
+ cmpl $262145, %edx
+ =>
+ cmpl $262144, %eax
+
+//===---------------------------------------------------------------------===//
+
+define i64 @test(double %X) {
+ %Y = fptosi double %X to i64
+ ret i64 %Y
+}
+
+compiles to:
+
+_test:
+ subl $20, %esp
+ movsd 24(%esp), %xmm0
+ movsd %xmm0, 8(%esp)
+ fldl 8(%esp)
+ fisttpll (%esp)
+ movl 4(%esp), %edx
+ movl (%esp), %eax
+ addl $20, %esp
+ #FP_REG_KILL
+ ret
+
+This should just fldl directly from the input stack slot.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+int foo (int x) { return (x & 65535) | 255; }
+
+Should compile into:
+
+_foo:
+ movzwl 4(%esp), %eax
+ orl $255, %eax
+ ret
+
+instead of:
+_foo:
+ movl $65280, %eax
+ andl 4(%esp), %eax
+ orl $255, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+We're codegen'ing multiply of long longs inefficiently:
+
+unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
+ return arg1 * arg2;
+}
+
+We compile to (fomit-frame-pointer):
+
+_LLM:
+ pushl %esi
+ movl 8(%esp), %ecx
+ movl 16(%esp), %esi
+ movl %esi, %eax
+ mull %ecx
+ imull 12(%esp), %esi
+ addl %edx, %esi
+ imull 20(%esp), %ecx
+ movl %esi, %edx
+ addl %ecx, %edx
+ popl %esi
+ ret
+
+This looks like a scheduling deficiency and lack of remat of the load from
+the argument area. ICC apparently produces:
+
+ movl 8(%esp), %ecx
+ imull 12(%esp), %ecx
+ movl 16(%esp), %eax
+ imull 4(%esp), %eax
+ addl %eax, %ecx
+ movl 4(%esp), %eax
+ mull 12(%esp)
+ addl %ecx, %edx
+ ret
+
+Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
+
+//===---------------------------------------------------------------------===//
+
+We can fold a store into "zeroing a reg". Instead of:
+
+xorl %eax, %eax
+movl %eax, 124(%esp)
+
+we should get:
+
+movl $0, 124(%esp)
+
+if the flags of the xor are dead.
+
+Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should
+be folded into: shl [mem], 1
+
+//===---------------------------------------------------------------------===//
+
+In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
+or and instruction, for example:
+
+ xorpd LCPI1_0, %xmm2
+
+However, if xmm2 gets spilled, we end up with really ugly code like this:
+
+ movsd (%esp), %xmm0
+ xorpd LCPI1_0, %xmm0
+ movsd %xmm0, (%esp)
+
+Since we 'know' that this is a 'neg', we can actually "fold" the spill into
+the neg/abs instruction, turning it into an *integer* operation, like this:
+
+ xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31)
+
+you could also use xorb, but xorl is less likely to lead to a partial register
+stall. Here is a contrived testcase:
+
+double a, b, c;
+void test(double *P) {
+ double X = *P;
+ a = X;
+ bar();
+ X = -X;
+ b = X;
+ bar();
+ c = X;
+}
+
+//===---------------------------------------------------------------------===//
+
+The code generated on x86 for checking for signed overflow of a multiply done
+the obvious way is much longer than it needs to be.
+
+int x(int a, int b) {
+ long long prod = (long long)a*b;
+ return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
+}
+
+See PR2053 for more details.
+
+//===---------------------------------------------------------------------===//
+
+We should investigate using cdq/cltd (effect: edx = sar eax, 31)
+more aggressively; it should cost the same as a move+shift on any modern
+processor, but it's a lot shorter. Downside is that it puts more
+pressure on register allocation because it has fixed operands.
+
+Example:
+int abs(int x) {return x < 0 ? -x : x;}
+
+gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
+abs:
+ movl 4(%esp), %eax
+ cltd
+ xorl %edx, %eax
+ subl %edx, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Take the following code (from
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
+
+extern unsigned char first_one[65536];
+int FirstOnet(unsigned long long arg1)
+{
+ if (arg1 >> 48)
+ return (first_one[arg1 >> 48]);
+ return 0;
+}
+
+
+The following code is currently generated:
+FirstOnet:
+ movl 8(%esp), %eax
+ cmpl $65536, %eax
+ movl 4(%esp), %ecx
+ jb .LBB1_2 # UnifiedReturnBlock
+.LBB1_1: # ifthen
+ shrl $16, %eax
+ movzbl first_one(%eax), %eax
+ ret
+.LBB1_2: # UnifiedReturnBlock
+ xorl %eax, %eax
+ ret
+
+We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
+lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
+
+//===---------------------------------------------------------------------===//
+
+We compile this function:
+
+define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind {
+entry:
+ %tmp2 = icmp eq i8 %d, 0 ; <i1> [#uses=1]
+ br i1 %tmp2, label %bb7, label %bb
+
+bb: ; preds = %entry
+ %tmp6 = add i32 %b, %a ; <i32> [#uses=1]
+ ret i32 %tmp6
+
+bb7: ; preds = %entry
+ %tmp10 = sub i32 %a, %c ; <i32> [#uses=1]
+ ret i32 %tmp10
+}
+
+to:
+
+foo: # @foo
+# BB#0: # %entry
+ movl 4(%esp), %ecx
+ cmpb $0, 16(%esp)
+ je .LBB0_2
+# BB#1: # %bb
+ movl 8(%esp), %eax
+ addl %ecx, %eax
+ ret
+.LBB0_2: # %bb7
+ movl 12(%esp), %edx
+ movl %ecx, %eax
+ subl %edx, %eax
+ ret
+
+There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
+couple more movls by putting 4(%esp) into %eax instead of %ecx.
+
+//===---------------------------------------------------------------------===//
+
+See rdar://4653682.
+
+From flops:
+
+LBB1_15: # bb310
+ cvtss2sd LCPI1_0, %xmm1
+ addsd %xmm1, %xmm0
+ movsd 176(%esp), %xmm2
+ mulsd %xmm0, %xmm2
+ movapd %xmm2, %xmm3
+ mulsd %xmm3, %xmm3
+ movapd %xmm3, %xmm4
+ mulsd LCPI1_23, %xmm4
+ addsd LCPI1_24, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_25, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_26, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_27, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_28, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd %xmm1, %xmm4
+ mulsd %xmm2, %xmm4
+ movsd 152(%esp), %xmm1
+ addsd %xmm4, %xmm1
+ movsd %xmm1, 152(%esp)
+ incl %eax
+ cmpl %eax, %esi
+ jge LBB1_15 # bb310
+LBB1_16: # bb358.loopexit
+ movsd 152(%esp), %xmm0
+ addsd %xmm0, %xmm0
+ addsd LCPI1_22, %xmm0
+ movsd %xmm0, 152(%esp)
+
+Rather than spilling the result of the last addsd in the loop, we should have
+inserted a copy to split the interval (one for the duration of the loop, one
+extending to the fall through). The register pressure in the loop isn't high
+enough to warrant the spill.
+
+Also check why xmm7 is not used at all in the function.
+
+//===---------------------------------------------------------------------===//
+
+Take the following:
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
+target triple = "i386-apple-darwin8"
+@in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2]
+define fastcc void @abort_gzip() noreturn nounwind {
+entry:
+ %tmp.b.i = load i1* @in_exit.4870.b ; <i1> [#uses=1]
+ br i1 %tmp.b.i, label %bb.i, label %bb4.i
+bb.i: ; preds = %entry
+ tail call void @exit( i32 1 ) noreturn nounwind
+ unreachable
+bb4.i: ; preds = %entry
+ store i1 true, i1* @in_exit.4870.b
+ tail call void @exit( i32 1 ) noreturn nounwind
+ unreachable
+}
+declare void @exit(i32) noreturn nounwind
+
+This compiles into:
+_abort_gzip: ## @abort_gzip
+## BB#0: ## %entry
+ subl $12, %esp
+ movb _in_exit.4870.b, %al
+ cmpb $1, %al
+ jne LBB0_2
+
+We somehow miss folding the movb into the cmpb.
+
+//===---------------------------------------------------------------------===//
+
+We compile:
+
+int test(int x, int y) {
+ return x-y-1;
+}
+
+into (-m64):
+
+_test:
+ decl %edi
+ movl %edi, %eax
+ subl %esi, %eax
ret
+it would be better to codegen this as x+~y (notl+addl): since ~y = -y-1,
+x+~y equals x-y-1.
+
//===---------------------------------------------------------------------===//
-Some useful information in the Apple Altivec / SSE Migration Guide:
+This code:
+
+int foo(const char *str,...)
+{
+ __builtin_va_list a; int x;
+ __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
+ return x;
+}
+
+gets compiled into this on x86-64:
+ subq $200, %rsp
+ movaps %xmm7, 160(%rsp)
+ movaps %xmm6, 144(%rsp)
+ movaps %xmm5, 128(%rsp)
+ movaps %xmm4, 112(%rsp)
+ movaps %xmm3, 96(%rsp)
+ movaps %xmm2, 80(%rsp)
+ movaps %xmm1, 64(%rsp)
+ movaps %xmm0, 48(%rsp)
+ movq %r9, 40(%rsp)
+ movq %r8, 32(%rsp)
+ movq %rcx, 24(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 8(%rsp)
+ leaq (%rsp), %rax
+ movq %rax, 192(%rsp)
+ leaq 208(%rsp), %rax
+ movq %rax, 184(%rsp)
+ movl $48, 180(%rsp)
+ movl $8, 176(%rsp)
+ movl 176(%rsp), %eax
+ cmpl $47, %eax
+ jbe .LBB1_3 # bb
+.LBB1_1: # bb3
+ movq 184(%rsp), %rcx
+ leaq 8(%rcx), %rax
+ movq %rax, 184(%rsp)
+.LBB1_2: # bb4
+ movl (%rcx), %eax
+ addq $200, %rsp
+ ret
+.LBB1_3: # bb
+ movl %eax, %ecx
+ addl $8, %eax
+ addq 192(%rsp), %rcx
+ movl %eax, 176(%rsp)
+ jmp .LBB1_2 # bb4
+
+gcc 4.3 generates:
+ subq $96, %rsp
+.LCFI0:
+ leaq 104(%rsp), %rax
+ movq %rsi, -80(%rsp)
+ movl $8, -120(%rsp)
+ movq %rax, -112(%rsp)
+ leaq -88(%rsp), %rax
+ movq %rax, -104(%rsp)
+ movl $8, %eax
+ cmpl $48, %eax
+ jb .L6
+ movq -112(%rsp), %rdx
+ movl (%rdx), %eax
+ addq $96, %rsp
+ ret
+ .p2align 4,,10
+ .p2align 3
+.L6:
+ mov %eax, %edx
+ addq -104(%rsp), %rdx
+ addl $8, %eax
+ movl %eax, -120(%rsp)
+ movl (%rdx), %eax
+ addq $96, %rsp
+ ret
+
+and it gets compiled into this on x86:
+ pushl %ebp
+ movl %esp, %ebp
+ subl $4, %esp
+ leal 12(%ebp), %eax
+ movl %eax, -4(%ebp)
+ leal 16(%ebp), %eax
+ movl %eax, -4(%ebp)
+ movl 12(%ebp), %eax
+ addl $4, %esp
+ popl %ebp
+ ret
+
+gcc 4.3 generates:
+ pushl %ebp
+ movl %esp, %ebp
+ movl 12(%ebp), %eax
+ popl %ebp
+ ret
+
+//===---------------------------------------------------------------------===//
-http://developer.apple.com/documentation/Performance/Conceptual/
-Accelerate_sse_migration/index.html
+Teach tblgen not to check bitconvert source type in some cases. This allows us
+to consolidate the following patterns in X86InstrMMX.td:
-e.g. SSE select using and, andnot, or. Various SSE compare translations.
+def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
+
+There are other cases in various td files.
+
+//===---------------------------------------------------------------------===//
+
+Take something like the following on x86-32:
+unsigned a(unsigned long long x, unsigned y) {return x % y;}
+
+We currently generate a libcall, but we really shouldn't: the expansion is
+shorter and likely faster than the libcall. The expected code is something
+like the following:
+
+ movl 12(%ebp), %eax
+ movl 16(%ebp), %ecx
+ xorl %edx, %edx
+ divl %ecx
+ movl 8(%ebp), %eax
+ divl %ecx
+ movl %edx, %eax
+ ret
+
+A similar code sequence works for division.
+
+//===---------------------------------------------------------------------===//
+
+These should compile to the same code, but the latter codegens to useless
+instructions on X86. This may be a trivial dag combine (GCC PR7061):
+
+struct s1 { unsigned char a, b; };
+unsigned long f1(struct s1 x) {
+ return x.a + x.b;
+}
+struct s2 { unsigned a: 8, b: 8; };
+unsigned long f2(struct s2 x) {
+ return x.a + x.b;
+}
+
+//===---------------------------------------------------------------------===//
+
+We currently compile this:
+
+define i32 @func1(i32 %v1, i32 %v2) nounwind {
+entry:
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %sum = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %normal
+normal:
+ ret i32 %sum
+overflow:
+ call void @llvm.trap()
+ unreachable
+}
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
+declare void @llvm.trap()
+
+to:
+
+_func1:
+ movl 4(%esp), %eax
+ addl 8(%esp), %eax
+ jo LBB1_2 ## overflow
+LBB1_1: ## normal
+ ret
+LBB1_2: ## overflow
+ ud2
+
+it would be nice to produce "into" someday.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+void vec_mpys1(int y[], const int x[], int scaler) {
+int i;
+for (i = 0; i < 150; i++)
+ y[i] += (((long long)scaler * (long long)x[i]) >> 31);
+}
+
+Compiles to this loop with GCC 3.x:
+
+.L5:
+ movl %ebx, %eax
+ imull (%edi,%ecx,4)
+ shrdl $31, %edx, %eax
+ addl %eax, (%esi,%ecx,4)
+ incl %ecx
+ cmpl $149, %ecx
+ jle .L5
+
+llvm-gcc compiles it to the much uglier:
+
+LBB1_1: ## bb1
+ movl 24(%esp), %eax
+ movl (%eax,%edi,4), %ebx
+ movl %ebx, %ebp
+ imull %esi, %ebp
+ movl %ebx, %eax
+ mull %ecx
+ addl %ebp, %edx
+ sarl $31, %ebx
+ imull %ecx, %ebx
+ addl %edx, %ebx
+ shldl $1, %eax, %ebx
+ movl 20(%esp), %eax
+ addl %ebx, (%eax,%edi,4)
+ incl %edi
+ cmpl $150, %edi
+ jne LBB1_1 ## bb1
+
+The issue is that we hoist the cast of "scaler" to long long outside of the
+loop, the value comes into the loop as two values, and
+RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
+constructed BUILD_PAIR which represents the cast value.
+
+This can be handled by making CodeGenPrepare sink the cast.
+
+//===---------------------------------------------------------------------===//
+
+Test instructions can be eliminated by using EFLAGS values from arithmetic
+instructions. This is currently not done for mul, and, or, xor, neg, shl,
+sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
+for read-modify-write instructions. It is also currently not done if the
+OF or CF flags are needed.
+
+The shift operators have the complication that when the shift count is
+zero, EFLAGS is not set, so they can only subsume a test instruction if
+the shift count is known to be non-zero. Also, using the EFLAGS value
+from a shift is apparently very slow on some x86 implementations.
+
+In read-modify-write instructions, the root node in the isel match is
+the store, and isel has no way for the use of the EFLAGS result of the
+arithmetic to be remapped to the new node.
+
+Add and subtract instructions set OF on signed overflow and CF on unsigned
+overflow, while test instructions always clear OF and CF. In order to
+replace a test with an add or subtract in a situation where OF or CF is
+needed, codegen must be able to prove that the operation cannot see
+signed or unsigned overflow, respectively.
+
+//===---------------------------------------------------------------------===//
+
+memcpy/memmove do not lower to SSE copies when possible. A silly example is:
+define <16 x float> @foo(<16 x float> %A) nounwind {
+ %tmp = alloca <16 x float>, align 16
+ %tmp2 = alloca <16 x float>, align 16
+ store <16 x float> %A, <16 x float>* %tmp
+ %s = bitcast <16 x float>* %tmp to i8*
+ %s2 = bitcast <16 x float>* %tmp2 to i8*
+ call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
+ %R = load <16 x float>* %tmp2
+ ret <16 x float> %R
+}
+
+declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
+
+which compiles to:
+
+_foo:
+ subl $140, %esp
+ movaps %xmm3, 112(%esp)
+ movaps %xmm2, 96(%esp)
+ movaps %xmm1, 80(%esp)
+ movaps %xmm0, 64(%esp)
+ movl 60(%esp), %eax
+ movl %eax, 124(%esp)
+ movl 56(%esp), %eax
+ movl %eax, 120(%esp)
+ movl 52(%esp), %eax
+ <many many more 32-bit copies>
+ movaps (%esp), %xmm0
+ movaps 16(%esp), %xmm1
+ movaps 32(%esp), %xmm2
+ movaps 48(%esp), %xmm3
+ addl $140, %esp
+ ret
+
+On Nehalem, it may even be cheaper to just use movups when unaligned than to
+fall back to lower-granularity chunks.
+
+//===---------------------------------------------------------------------===//
+
+Implement processor-specific optimizations for parity with GCC on these
+processors. GCC does two optimizations:
+
+1. ix86_pad_returns inserts a noop before a ret instruction if it is
+   immediately preceded by a conditional branch or is the target of a jump.
+2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
+ code contains more than 3 branches.
+
+The first one is done for all AMDs, Core2, and "Generic".
+The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
+  Core 2, and "Generic".
+
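+For example, ix86_pad_returns would turn something like
+
+	jne	.L3
+	ret
+
+into
+
+	jne	.L3
+	nop
+	ret
+
+so the ret does not immediately follow the conditional branch.
+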
+//===---------------------------------------------------------------------===//
+
+Testcase:
+int x(int a) { return (a&0xf0)>>4; }
+
+Current output:
+ movl 4(%esp), %eax
+ shrl $4, %eax
+ andl $15, %eax
+ ret
+
+Ideal output:
+ movzbl 4(%esp), %eax
+ shrl $4, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Re-implement the atomic builtins __sync_add_and_fetch() and
+__sync_sub_and_fetch() properly.
+
+When the return value is not used (i.e. we only care about the value in
+memory), x86 does not have to use a value-returning form (e.g. lock xadd) to
+implement these. Instead, it can use add, sub, inc, or dec instructions with
+the "lock" prefix.
+
+This is currently implemented using a bit of an instruction selection trick.
+The issue is that the target-independent pattern produces one output and a
+chain, and we want to map it into one that just outputs a chain. The current
+trick is to select it into a MERGE_VALUES with the first definition being an
+implicit_def. The proper solution is to add new ISD opcodes for the no-output
+variant. The DAG combiner can then transform the node before it gets to target
+node selection.
+
+Problem #2 is that we are adding a whole bunch of x86 atomic instructions when
+in fact these instructions are identical to the non-lock versions. We need a
+way to add target-specific information to target nodes and have this
+information carried over to machine instructions. The asm printer (or JIT) can
+then use this information to add the "lock" prefix.
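+
+For example (a sketch of the desired output rather than what we emit today;
+x86-64, with p in %rdi):
+
+void f(int *p) { __sync_fetch_and_add(p, 1); }   /* result unused */
+
+should lower to just:
+
+	lock
+	incl	(%rdi)
+	ret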
+
+//===---------------------------------------------------------------------===//
+
+struct B {
+ unsigned char y0 : 1;
+};
+
+int bar(struct B* a) { return a->y0; }
+
+define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
+ %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
+ %2 = load i8* %1, align 1
+ %3 = and i8 %2, 1
+ %4 = zext i8 %3 to i32
+ ret i32 %4
+}
+
+bar: # @bar
+# BB#0:
+ movb (%rdi), %al
+ andb $1, %al
+ movzbl %al, %eax
+ ret
+
+Missed optimization: should be movl+andl.
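+
+Presumably something along the lines of (one plausible reading: a zero-extended
+byte load plus a 32-bit mask; the exact intended form is a guess):
+
+	movzbl	(%rdi), %eax
+	andl	$1, %eax
+	ret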
+
+//===---------------------------------------------------------------------===//
+
+The x86_64 ABI says:
+
+Booleans, when stored in a memory object, are stored as single byte objects the
+value of which is always 0 (false) or 1 (true).
+
+We are not using this fact:
+
+int bar(_Bool *a) { return *a; }
+
+define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
+ %1 = load i8* %a, align 1, !tbaa !0
+ %tmp = and i8 %1, 1
+ %2 = zext i8 %tmp to i32
+ ret i32 %2
+}
+
+bar:
+ movb (%rdi), %al
+ andb $1, %al
+ movzbl %al, %eax
+ ret
+
+GCC produces
+
+bar:
+ movzbl (%rdi), %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Consider the following two functions compiled with clang:
+_Bool foo(int *x) { return !(*x & 4); }
+unsigned bar(int *x) { return !(*x & 4); }
+
+foo:
+ movl 4(%esp), %eax
+ testb $4, (%eax)
+ sete %al
+ movzbl %al, %eax
+ ret
+
+bar:
+ movl 4(%esp), %eax
+ movl (%eax), %eax
+ shrl $2, %eax
+ andl $1, %eax
+ xorl $1, %eax
+ ret
+
+The second function generates more code even though the two functions are
+functionally identical.
+
+//===---------------------------------------------------------------------===//
+
+Take the following C code:
+int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
+
+We generate the following IR with clang:
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %tmp = xor i32 %b, %a ; <i32> [#uses=1]
+ %tmp6 = and i32 %tmp, 255 ; <i32> [#uses=1]
+ %cmp = icmp eq i32 %tmp6, 0 ; <i1> [#uses=1]
+ %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
+ ret i32 %conv5
+}
+
+And the following x86 code:
+ xorl %esi, %edi
+ testb $-1, %dil
+ sete %al
+ movzbl %al, %eax
+ ret
+
+A cmpb instead of the xorl+testb would be one instruction shorter.
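+
+i.e. roughly (same register assignments as above):
+
+	cmpb	%sil, %dil
+	sete	%al
+	movzbl	%al, %eax
+	ret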
+
+//===---------------------------------------------------------------------===//
+
+Given the following C code:
+int f(int a, int b) { return (signed char)a == (signed char)b; }
+
+We generate the following IR with clang:
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %sext = shl i32 %a, 24 ; <i32> [#uses=1]
+ %conv1 = ashr i32 %sext, 24 ; <i32> [#uses=1]
+ %sext6 = shl i32 %b, 24 ; <i32> [#uses=1]
+ %conv4 = ashr i32 %sext6, 24 ; <i32> [#uses=1]
+ %cmp = icmp eq i32 %conv1, %conv4 ; <i1> [#uses=1]
+ %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
+ ret i32 %conv5
+}
+
+And the following x86 code:
+ movsbl %sil, %eax
+ movsbl %dil, %ecx
+ cmpl %eax, %ecx
+ sete %al
+ movzbl %al, %eax
+ ret
+
+It should be possible to eliminate the sign extensions: equality of the
+sign-extended values is equivalent to equality of the original bytes, so a
+single cmpb (as in the previous entry) would suffice.
+
+//===---------------------------------------------------------------------===//
+
+LLVM misses a load+store narrowing opportunity in this code:
+
+%struct.bf = type { i64, i16, i16, i32 }
+
+@bfi = external global %struct.bf* ; <%struct.bf**> [#uses=2]
+
+define void @t1() nounwind ssp {
+entry:
+ %0 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
+ %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
+ %2 = bitcast i16* %1 to i32* ; <i32*> [#uses=2]
+ %3 = load i32* %2, align 1 ; <i32> [#uses=1]
+ %4 = and i32 %3, -65537 ; <i32> [#uses=1]
+ store i32 %4, i32* %2, align 1
+ %5 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
+ %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
+ %7 = bitcast i16* %6 to i32* ; <i32*> [#uses=2]
+ %8 = load i32* %7, align 1 ; <i32> [#uses=1]
+ %9 = and i32 %8, -131073 ; <i32> [#uses=1]
+ store i32 %9, i32* %7, align 1
+ ret void
+}
+
+LLVM currently emits this:
+
+ movq bfi(%rip), %rax
+ andl $-65537, 8(%rax)
+ movq bfi(%rip), %rax
+ andl $-131073, 8(%rax)
+ ret
+
+It could narrow the loads and stores to emit this:
+
+ movq bfi(%rip), %rax
+ andb $-2, 10(%rax)
+ movq bfi(%rip), %rax
+ andb $-3, 10(%rax)
+ ret
+
+The trouble is that there is a TokenFactor between the store and the
+load, making it non-trivial to determine if there's anything between
+the load and the store which would prohibit narrowing.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+void foo(unsigned x) {
+ if (x == 0) bar();
+ else if (x == 1) qux();
+}
+
+currently compiles into:
+_foo:
+ movl 4(%esp), %eax
+ cmpl $1, %eax
+ je LBB0_3
+ testl %eax, %eax
+ jne LBB0_4
+
+the testl could be removed:
+_foo:
+ movl 4(%esp), %eax
+ cmpl $1, %eax
+ je LBB0_3
+ jb LBB0_4
+
+0 is the only unsigned number < 1.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+%0 = type { i32, i1 }
+
+define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
+entry:
+ %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
+ %cmp = extractvalue %0 %uadd, 1
+ %inc = zext i1 %cmp to i32
+ %add = add i32 %x, %sum
+ %z.0 = add i32 %add, %inc
+ ret i32 %z.0
+}
+
+declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
+
+compiles to:
+
+_add32carry: ## @add32carry
+ addl %esi, %edi
+ sbbl %ecx, %ecx
+ movl %edi, %eax
+ subl %ecx, %eax
+ ret
+
+But it could be:
+
+_add32carry:
+ leal (%rsi,%rdi), %eax
+ cmpl %esi, %eax
+ adcl $0, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+The hot loop of 256.bzip2 contains code that looks a bit like this:
+
+int foo(char *P, char *Q, int x, int y) {
+ if (P[0] != Q[0])
+ return P[0] < Q[0];
+ if (P[1] != Q[1])
+ return P[1] < Q[1];
+ if (P[2] != Q[2])
+ return P[2] < Q[2];
+ return P[3] < Q[3];
+}
+
+In the real code, we get a lot more wrong than this. However, even in this
+code we generate:
+
+_foo: ## @foo
+## BB#0: ## %entry
+ movb (%rsi), %al
+ movb (%rdi), %cl
+ cmpb %al, %cl
+ je LBB0_2
+LBB0_1: ## %if.then
+ cmpb %al, %cl
+ jmp LBB0_5
+LBB0_2: ## %if.end
+ movb 1(%rsi), %al
+ movb 1(%rdi), %cl
+ cmpb %al, %cl
+ jne LBB0_1
+## BB#3: ## %if.end38
+ movb 2(%rsi), %al
+ movb 2(%rdi), %cl
+ cmpb %al, %cl
+ jne LBB0_1
+## BB#4: ## %if.end60
+ movb 3(%rdi), %al
+ cmpb 3(%rsi), %al
+LBB0_5: ## %if.end60
+ setl %al
+ movzbl %al, %eax
+ ret
+
+Note that we generate jumps to LBB0_1 which does a redundant compare. The
+redundant compare also forces the register values to be live, which prevents
+folding one of the loads into the compare. In contrast, GCC 4.2 produces:
+
+_foo:
+ movzbl (%rsi), %eax
+ cmpb %al, (%rdi)
+ jne L10
+L12:
+ movzbl 1(%rsi), %eax
+ cmpb %al, 1(%rdi)
+ jne L10
+ movzbl 2(%rsi), %eax
+ cmpb %al, 2(%rdi)
+ jne L10
+ movzbl 3(%rdi), %eax
+ cmpb 3(%rsi), %al
+L10:
+ setl %al
+ movzbl %al, %eax
+ ret
+
+which is "perfect".
+
+//===---------------------------------------------------------------------===//
+
+For the branch in the following code:
+int a();
+int b(int x, int y) {
+ if (x & (1<<(y&7)))
+ return a();
+ return y;
+}
+
+We currently generate:
+ movb %sil, %al
+ andb $7, %al
+ movzbl %al, %eax
+ btl %eax, %edi
+ jae .LBB0_2
+
+movl+andl would be shorter than the movb+andb+movzbl sequence.
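+
+i.e. something like (same register assignments as above):
+
+	movl	%esi, %eax
+	andl	$7, %eax
+	btl	%eax, %edi
+	jae	.LBB0_2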
+
+//===---------------------------------------------------------------------===//
+
+For the following:
+struct u1 {
+ float x, y;
+};
+float foo(struct u1 u) {
+ return u.x + u.y;
+}
+
+We currently generate:
+ movdqa %xmm0, %xmm1
+ pshufd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0,0,0]
+ addss %xmm1, %xmm0
+ ret
+
+We could save an instruction here by commuting the addss.
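+
+e.g. (a sketch):
+
+	pshufd	$1, %xmm0, %xmm1	# xmm1[0] = u.y
+	addss	%xmm1, %xmm0		# xmm0[0] = u.x + u.y
+	ret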
+
+//===---------------------------------------------------------------------===//
+
+This (from PR9661):
+
+float clamp_float(float a) {
+ if (a > 1.0f)
+ return 1.0f;
+ else if (a < 0.0f)
+ return 0.0f;
+ else
+ return a;
+}
+
+Could compile to:
+
+clamp_float: # @clamp_float
+ movss .LCPI0_0(%rip), %xmm1
+ minss %xmm1, %xmm0
+ pxor %xmm1, %xmm1
+ maxss %xmm1, %xmm0
+ ret
+
+with -ffast-math.
+
+//===---------------------------------------------------------------------===//
+
+This function (from PR9803):
+
+int clamp2(int a) {
+ if (a > 5)
+ a = 5;
+ if (a < 0)
+ return 0;
+ return a;
+}
+
+Compiles to:
+
+_clamp2: ## @clamp2
+ pushq %rbp
+ movq %rsp, %rbp
+ cmpl $5, %edi
+ movl $5, %ecx
+ cmovlel %edi, %ecx
+ testl %ecx, %ecx
+ movl $0, %eax
+ cmovnsl %ecx, %eax
+ popq %rbp
+ ret
+
+The move of 0 could be scheduled above the test so that it can be an
+xor reg,reg.
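+
+i.e. roughly (frame setup omitted):
+
+_clamp2:
+	xorl	%eax, %eax
+	cmpl	$5, %edi
+	movl	$5, %ecx
+	cmovlel	%edi, %ecx
+	testl	%ecx, %ecx
+	cmovnsl	%ecx, %eax
+	ret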
+
+//===---------------------------------------------------------------------===//
+
+GCC PR48986. We currently compile this:
+
+void bar(void);
+void yyy(int* p) {
+ if (__sync_fetch_and_add(p, -1) == 1)
+ bar();
+}
+
+into:
+ movl $-1, %eax
+ lock
+ xaddl %eax, (%rdi)
+ cmpl $1, %eax
+ je LBB0_2
+
+Instead we could generate:
+
+	lock
+	decl	(%rdi)
+	je	LBB0_2
+
+The trick is to match "fetch_and_add(X, -C) == C".
+
+//===---------------------------------------------------------------------===//
+
+unsigned t(unsigned a, unsigned b) {
+ return a <= b ? 5 : -5;
+}
+
+We generate:
+ movl $5, %ecx
+ cmpl %esi, %edi
+ movl $-5, %eax
+ cmovbel %ecx, %eax
+
+GCC:
+ cmpl %edi, %esi
+ sbbl %eax, %eax
+ andl $-10, %eax
+ addl $5, %eax
+
+//===---------------------------------------------------------------------===//