X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FREADME.txt;h=4464878ce2173c49c827b5a04cb9314243d461b3;hb=b9e126ce7de05d580d9eb3b664487dab07304939;hp=759c7acf389173e3d52de301f3720cb1855c3751;hpb=63079f0757785c5c461bafdd3101ee40aeb717fe;p=oota-llvm.git

diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 759c7acf389..4464878ce21 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -2,11 +2,8 @@
 // Random ideas for the X86 backend.
 //===---------------------------------------------------------------------===//
 
-Missing features:
-  - Support for SSE4: http://www.intel.com/software/penryn
-http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
-  - support for 3DNow!
-  - weird abis?
+We should add support for the "movbe" instruction, which does a byte-swapping
+copy (3-addr bswap + memory support?)  This is available on Atom processors.
 
 //===---------------------------------------------------------------------===//
 
@@ -54,6 +51,17 @@ One better solution for 1LL << x is:
 
 But that requires good 8-bit subreg support.
 
+Also, this might be better.  It's an extra shift, but it's one instruction
+shorter, and doesn't stress 8-bit subreg support.
+(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
+but without the unnecessary and.)
+        movl %ecx, %eax
+        shrl $5, %eax
+        movl %eax, %edx
+        xorl $1, %edx
+        sall %cl, %eax
+        sall %cl, %edx
+
 64-bit shifts (in general) expand to really bad code.  Instead of using
 cmovs, we should expand to a conditional branch like GCC produces.
 
@@ -67,6 +75,9 @@ into:
         xorl    $1, %eax
         ret
 
+(Although note that this isn't a legal way to express the code that llvm-gcc
+currently generates for that function.)
+
 //===---------------------------------------------------------------------===//
 
 Some isel ideas:
@@ -94,34 +105,6 @@ the coalescer how to deal with it though.
 
 //===---------------------------------------------------------------------===//
 
-Count leading zeros and count trailing zeros:
-
-int clz(int X) { return __builtin_clz(X); }
-int ctz(int X) { return __builtin_ctz(X); }
-
-$ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
-clz:
-        bsr     %eax, DWORD PTR [%esp+4]
-        xor     %eax, 31
-        ret
-ctz:
-        bsf     %eax, DWORD PTR [%esp+4]
-        ret
-
-however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
-aren't.
-
-Another example (use predsimplify to eliminate a select):
-
-int foo (unsigned long j) {
-  if (j)
-    return __builtin_ffs (j) - 1;
-  else
-    return 0;
-}
-
-//===---------------------------------------------------------------------===//
-
 It appears icc use push for parameter passing. Need to investigate.
 
 //===---------------------------------------------------------------------===//
@@ -208,9 +191,9 @@ when we can spare a register. It reduces code size.
 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 get this:
 
-int %test1(int %X) {
-        %Y = div int %X, 8
-        ret int %Y
+define i32 @test1(i32 %X) {
+    %Y = sdiv i32 %X, 8
+    ret i32 %Y
 }
 
 _test1:
@@ -236,32 +219,6 @@ which is probably slower, but it's interesting at least :)
 
 //===---------------------------------------------------------------------===//
 
-The first BB of this code:
-
-declare bool %foo()
-int %bar() {
-        %V = call bool %foo()
-        br bool %V, label %T, label %F
-T:
-        ret int 1
-F:
-        call bool %foo()
-        ret int 12
-}
-
-compiles to:
-
-_bar:
-        subl $12, %esp
-        call L_foo$stub
-        xorb $1, %al
-        testb %al, %al
-        jne LBB_bar_2   # F
-
-It would be better to emit "cmp %al, 1" than a xor and test.
-
-//===---------------------------------------------------------------------===//
-
 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 We should leave these as libcalls for everything over a much lower threshold,
 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
@@ -278,113 +235,17 @@ Optimize copysign(x, *y) to use an integer load from y.
 
 //===---------------------------------------------------------------------===//
 
-%X = weak global int 0
-
-void %foo(int %N) {
-	%N = cast int %N to uint
-	%tmp.24 = setgt int %N, 0
-	br bool %tmp.24, label %no_exit, label %return
-
-no_exit:
-	%indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
-	%i.0.0 = cast uint %indvar to int
-	volatile store int %i.0.0, int* %X
-	%indvar.next = add uint %indvar, 1
-	%exitcond = seteq uint %indvar.next, %N
-	br bool %exitcond, label %return, label %no_exit
-
-return:
-	ret void
-}
-
-compiles into:
-
-	.text
-	.align	4
-	.globl	_foo
-_foo:
-	movl 4(%esp), %eax
-	cmpl $1, %eax
-	jl LBB_foo_4	# return
-LBB_foo_1:	# no_exit.preheader
-	xorl %ecx, %ecx
-LBB_foo_2:	# no_exit
-	movl L_X$non_lazy_ptr, %edx
-	movl %ecx, (%edx)
-	incl %ecx
-	cmpl %eax, %ecx
-	jne LBB_foo_2	# no_exit
-LBB_foo_3:	# return.loopexit
-LBB_foo_4:	# return
-	ret
-
-We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
-remateralization is implemented. This can be accomplished with 1) a target
-dependent LICM pass or 2) makeing SelectDAG represent the whole function. 
-
-//===---------------------------------------------------------------------===//
-
 The following tests perform worse with LSR:
 
 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
 
 //===---------------------------------------------------------------------===//
 
-We are generating far worse code than gcc:
-
-volatile short X, Y;
-
-void foo(int N) {
-  int i;
-  for (i = 0; i < N; i++) { X = i; Y = i*4; }
-}
-
-LBB1_1:	# entry.bb_crit_edge
-	xorl	%ecx, %ecx
-	xorw	%dx, %dx
-LBB1_2:	# bb
-	movl	L_X$non_lazy_ptr, %esi
-	movw	%cx, (%esi)
-	movl	L_Y$non_lazy_ptr, %esi
-	movw	%dx, (%esi)
-	addw	$4, %dx
-	incl	%ecx
-	cmpl	%eax, %ecx
-	jne	LBB1_2	# bb
-
-vs.
-
-	xorl	%edx, %edx
-	movl	L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
-	movl	L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
-L4:
-	movw	%dx, (%esi)
-	leal	0(,%edx,4), %eax
-	movw	%ax, (%ecx)
-	addl	$1, %edx
-	cmpl	%edx, %edi
-	jne	L4
-
-This is due to the lack of post regalloc LICM.
-
-//===---------------------------------------------------------------------===//
-
 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 FR64 to VR128.
 
 //===---------------------------------------------------------------------===//
 
-mov $reg, 48(%esp)
-...
-leal 48(%esp), %eax
-mov %eax, (%esp)
-call _foo
-
-Obviously it would have been better for the first mov (or any op) to store
-directly %esp[0] if there are no other uses.
-
-//===---------------------------------------------------------------------===//
-
 Adding to the list of cmp / test poor codegen issues:
 
 int test(__m128 *A, __m128 *B) {
@@ -425,75 +286,6 @@ There is also one case we do worse on PPC.
 
 //===---------------------------------------------------------------------===//
 
-If shorter, we should use things like:
-movzwl %ax, %eax
-instead of:
-andl $65535, %EAX
-
-The former can also be used when the two-addressy nature of the 'and' would
-require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
-
-//===---------------------------------------------------------------------===//
-
-Consider this:
-
-typedef struct pair { float A, B; } pair;
-void pairtest(pair P, float *FP) {
-        *FP = P.A+P.B;
-}
-
-We currently generate this code with llvmgcc4:
-
-_pairtest:
-        movl 8(%esp), %eax
-        movl 4(%esp), %ecx
-        movd %eax, %xmm0
-        movd %ecx, %xmm1
-        addss %xmm0, %xmm1
-        movl 12(%esp), %eax
-        movss %xmm1, (%eax)
-        ret
-
-we should be able to generate:
-_pairtest:
-        movss 4(%esp), %xmm0
-        movl 12(%esp), %eax
-        addss 8(%esp), %xmm0
-        movss %xmm0, (%eax)
-        ret
-
-The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
-integer chunks.  It does this so that structs like {short,short} are passed in
-a single 32-bit integer stack slot.  We should handle the safe cases above much
-nicer, while still handling the hard cases.
-
-While true in general, in this specific case we could do better by promoting
-load int + bitcast to float -> load fload.  This basically needs alignment info,
-the code is already implemented (but disabled) in dag combine).
-
-//===---------------------------------------------------------------------===//
-
-Another instruction selector deficiency:
-
-void %bar() {
-	%tmp = load int (int)** %foo
-	%tmp = tail call int %tmp( int 3 )
-	ret void
-}
-
-_bar:
-	subl $12, %esp
-	movl L_foo$non_lazy_ptr, %eax
-	movl (%eax), %eax
-	call *%eax
-	addl $12, %esp
-	ret
-
-The current isel scheme will not allow the load to be folded in the call since
-the load's chain result is read by the callseq_start.
-
-//===---------------------------------------------------------------------===//
-
 For this:
 
 int test(int a)
@@ -519,21 +311,30 @@ estimate to determine whether the match is profitable.
 However, if we care more about code size, then imull is better. It's two bytes
 shorter than movl + leal.
 
+On a Pentium M, both variants have the same characteristics with regard
+to throughput; however, the multiplication has a latency of four cycles, as
+opposed to two cycles for the movl+lea variant.
+
 //===---------------------------------------------------------------------===//
 
-Implement CTTZ, CTLZ with bsf and bsr. GCC produces:
+__builtin_ffs codegen is messy.
 
-int ctz_(unsigned X) { return __builtin_ctz(X); }
-int clz_(unsigned X) { return __builtin_clz(X); }
 int ffs_(unsigned X) { return __builtin_ffs(X); }
 
-_ctz_:
-        bsfl    4(%esp), %eax
-        ret
-_clz_:
-        bsrl    4(%esp), %eax
-        xorl    $31, %eax
+llvm produces:
+ffs_:
+        movl    4(%esp), %ecx
+        bsfl    %ecx, %eax
+        movl    $32, %edx
+        cmove   %edx, %eax
+        incl    %eax
+        xorl    %edx, %edx
+        testl   %ecx, %ecx
+        cmove   %edx, %eax
         ret
+
+vs gcc:
+
 _ffs_:
         movl    $-1, %edx
         bsfl    4(%esp), %eax
@@ -541,6 +342,15 @@ _ffs_:
         addl    $1, %eax
         ret
 
+Another example of __builtin_ffs (use predsimplify to eliminate a select):
+
+int foo (unsigned long j) {
+  if (j)
+    return __builtin_ffs (j) - 1;
+  else
+    return 0;
+}
+
 //===---------------------------------------------------------------------===//
 
 It appears gcc place string data with linkonce linkage in
@@ -551,25 +361,24 @@ do not make use of.
 
 //===---------------------------------------------------------------------===//
 
-int %foo(int* %a, int %t) {
+define i32 @foo(i32* %a, i32 %t) {
 entry:
-        br label %cond_true
-
-cond_true:              ; preds = %cond_true, %entry
-        %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]  
-        %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
-        %tmp2 = getelementptr int* %a, int %x.0.0              
-        %tmp3 = load int* %tmp2         ; <int> [#uses=1]
-        %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
-        %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
-        %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
-        %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
-        br bool %tmp, label %bb12, label %cond_true
-
-bb12:           ; preds = %cond_true
-        ret int %tmp7
+	br label %cond_true
+
+cond_true:		; preds = %cond_true, %entry
+	%x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]		; <i32> [#uses=3]
+	%t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]		; <i32> [#uses=1]
+	%tmp2 = getelementptr i32* %a, i32 %x.0.0		; <i32*> [#uses=1]
+	%tmp3 = load i32* %tmp2		; <i32> [#uses=1]
+	%tmp5 = add i32 %t_addr.0.0, %x.0.0		; <i32> [#uses=1]
+	%tmp7 = add i32 %tmp5, %tmp3		; <i32> [#uses=2]
+	%tmp9 = add i32 %x.0.0, 1		; <i32> [#uses=2]
+	%tmp = icmp sgt i32 %tmp9, 39		; <i1> [#uses=1]
+	br i1 %tmp, label %bb12, label %cond_true
+
+bb12:		; preds = %cond_true
+	ret i32 %tmp7
 }
-
 is pessimized by -loop-reduce and -indvars
 
 //===---------------------------------------------------------------------===//
@@ -673,40 +482,11 @@ _usesbb:
 
 //===---------------------------------------------------------------------===//
 
-Currently we don't have elimination of redundant stack manipulations. Consider
-the code:
-
-int %main() {
-entry:
-	call fastcc void %test1( )
-	call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
-	ret int 0
-}
-
-declare fastcc void %test1()
-
-declare fastcc void %test2(sbyte*)
-
-
-This currently compiles to:
-
-	subl $16, %esp
-	call _test5
-	addl $12, %esp
-	subl $16, %esp
-	movl $_test5, (%esp)
-	call _test6
-	addl $12, %esp
-
-The add\sub pair is really unneeded here.
-
-//===---------------------------------------------------------------------===//
-
 Consider the expansion of:
 
-uint %test3(uint %X) {
-        %tmp1 = rem uint %X, 255
-        ret uint %tmp1
+define i32 @test3(i32 %X) {
+        %tmp1 = urem i32 %X, 255
+        ret i32 %tmp1
 }
 
 Currently it compiles to:
@@ -740,9 +520,9 @@ imagine there has to be some kind of complicated decoder reset and realignment
 to grab the bytes from the next cacheline.
 
 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
-942  942 0x3d03 movl     %dh, (1809(%esp, %esi)                                                                          
-937  937 0x3d0a incl     %esi                           
-3    3   0x3d0b cmpb     %bl, %dl                                               
+942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
+937  937 0x3d0a incl     %esi
+3    3   0x3d0b cmpb     %bl, %dl
 27   27  0x3d0d jnz      0x000062db <main+11707>
 
 //===---------------------------------------------------------------------===//
@@ -948,22 +728,22 @@ Another example is:
 ;; allocator turns the shift into an LEA.  This also occurs for ADD.
 
 ; Check that the shift gets turned into an LEA.
-; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
+; RUN: llvm-as < %s | llc -march=x86 -x86-asm-syntax=intel | \
 ; RUN:   not grep {mov E.X, E.X}
 
-%G = external global int
+@G = external global i32		; <i32*> [#uses=3]
 
-int %test1(int %X, int %Y) {
-        %Z = add int %X, %Y
-        volatile store int %Y, int* %G
-        volatile store int %Z, int* %G
-        ret int %X
+define i32 @test1(i32 %X, i32 %Y) {
+	%Z = add i32 %X, %Y		; <i32> [#uses=1]
+	volatile store i32 %Y, i32* @G
+	volatile store i32 %Z, i32* @G
+	ret i32 %X
 }
 
-int %test2(int %X) {
-        %Z = add int %X, 1  ;; inc
-        volatile store int %Z, int* %G
-        ret int %X
+define i32 @test2(i32 %X) {
+	%Z = add i32 %X, 1		; <i32> [#uses=1]
+	volatile store i32 %Z, i32* @G
+	ret i32 %X
 }
 
 //===---------------------------------------------------------------------===//
@@ -998,51 +778,6 @@ _test:
 
 //===---------------------------------------------------------------------===//
 
-For code like:
-phi (undef, x)
-
-We get an implicit def on the undef side. If the phi is spilled, we then get:
-implicitdef xmm1
-store xmm1 -> stack
-
-It should be possible to teach the x86 backend to "fold" the store into the
-implicitdef, which just deletes the implicit def.
-
-These instructions should go away:
-#IMPLICIT_DEF %xmm1 
-movaps %xmm1, 192(%esp) 
-movaps %xmm1, 224(%esp) 
-movaps %xmm1, 176(%esp)
-
-//===---------------------------------------------------------------------===//
-
-This is a "commutable two-address" register coallescing deficiency:
-
-define <4 x float> @test1(<4 x float> %V) {
-entry:
-        %tmp8 = shufflevector <4 x float> %V, <4 x float> undef,
-                                        <4 x i32> < i32 3, i32 2, i32 1, i32 0 >
-        %add = add <4 x float> %tmp8, %V
-        ret <4 x float> %add
-}
-
-this codegens to:
-
-_test1:
-        pshufd  $27, %xmm0, %xmm1
-        addps   %xmm0, %xmm1
-        movaps  %xmm1, %xmm0
-        ret
-
-instead of:
-
-_test1:
-        pshufd  $27, %xmm0, %xmm1
-        addps   %xmm1, %xmm0
-        ret
-
-//===---------------------------------------------------------------------===//
-
 Leaf functions that require one 4-byte spill slot have a prolog like this:
 
 _foo:
@@ -1119,6 +854,8 @@ Should compile to:
                 setae   %al
                 ret
 
+FIXME: That code looks wrong; bool return is normally defined as zext.
+
 on x86-64, not:
 
 __Z11no_overflowjj:
@@ -1136,34 +873,6 @@ condition register is dead. xor reg reg is shorter than mov reg, #0.
 
 //===---------------------------------------------------------------------===//
 
-We aren't matching RMW instructions aggressively
-enough.  Here's a reduced testcase (more in PR1160):
-
-define void @test(i32* %huge_ptr, i32* %target_ptr) {
-        %A = load i32* %huge_ptr                ; <i32> [#uses=1]
-        %B = load i32* %target_ptr              ; <i32> [#uses=1]
-        %C = or i32 %A, %B              ; <i32> [#uses=1]
-        store i32 %C, i32* %target_ptr
-        ret void
-}
-
-$ llvm-as < t.ll | llc -march=x86-64
-
-_test:
-        movl (%rdi), %eax
-        orl (%rsi), %eax
-        movl %eax, (%rsi)
-        ret
-
-That should be something like:
-
-_test:
-        movl (%rdi), %eax
-        orl %eax, (%rsi)
-        ret
-
-//===---------------------------------------------------------------------===//
-
 The following code:
 
 bb114.preheader:		; preds = %cond_next94
@@ -1216,30 +925,6 @@ vice-versa).
 
 //===---------------------------------------------------------------------===//
 
-For this code:
-
-cond_next603:		; preds = %bb493, %cond_true336, %cond_next599
-	%v.21050.1 = phi i32 [ %v.21050.0, %cond_next599 ], [ %tmp344, %cond_true336 ], [ %v.2, %bb493 ]		; <i32> [#uses=1]
-	%maxz.21051.1 = phi i32 [ %maxz.21051.0, %cond_next599 ], [ 0, %cond_true336 ], [ %maxz.2, %bb493 ]		; <i32> [#uses=2]
-	%cnt.01055.1 = phi i32 [ %cnt.01055.0, %cond_next599 ], [ 0, %cond_true336 ], [ %cnt.0, %bb493 ]		; <i32> [#uses=2]
-	%byteptr.9 = phi i8* [ %byteptr.12, %cond_next599 ], [ %byteptr.0, %cond_true336 ], [ %byteptr.10, %bb493 ]		; <i8*> [#uses=9]
-	%bitptr.6 = phi i32 [ %tmp5571104.1, %cond_next599 ], [ %tmp4921049, %cond_true336 ], [ %bitptr.7, %bb493 ]		; <i32> [#uses=4]
-	%source.5 = phi i32 [ %tmp602, %cond_next599 ], [ %source.0, %cond_true336 ], [ %source.6, %bb493 ]		; <i32> [#uses=7]
-	%tmp606 = getelementptr %struct.const_tables* @tables, i32 0, i32 0, i32 %cnt.01055.1		; <i8*> [#uses=1]
-	%tmp607 = load i8* %tmp606, align 1		; <i8> [#uses=1]
-
-We produce this:
-
-LBB4_70:	# cond_next603
-	movl	-20(%ebp), %esi
-	movl	L_tables$non_lazy_ptr-"L4$pb"(%esi), %esi
-
-However, ICC caches this information before the loop and produces this:
-
-        movl      88(%esp), %eax                                #481.12
-
-//===---------------------------------------------------------------------===//
-
 This code:
 
 	%tmp659 = icmp slt i16 %tmp654, 0		; <i1> [#uses=1]
@@ -1256,37 +941,6 @@ suggests using the 32-bit register (which is what ICC uses).
 
 //===---------------------------------------------------------------------===//
 
-rdar://5506677 - We compile this:
-
-define i32 @foo(double %x) {
-        %x14 = bitcast double %x to i64         ; <i64> [#uses=1]
-        %tmp713 = trunc i64 %x14 to i32         ; <i32> [#uses=1]
-        %tmp8 = and i32 %tmp713, 2147483647             ; <i32> [#uses=1]
-        ret i32 %tmp8
-}
-
-to:
-
-_foo:
-        subl    $12, %esp
-        fldl    16(%esp)
-        fstpl   (%esp)
-        movl    $2147483647, %eax
-        andl    (%esp), %eax
-        addl    $12, %esp
-        #FP_REG_KILL
-        ret
-
-It would be much better to eliminate the fldl/fstpl by folding the bitcast 
-into the load SDNode.  That would give us:
-
-_foo:
-        movl    $2147483647, %eax
-        andl    4(%esp), %eax
-        ret
-
-//===---------------------------------------------------------------------===//
-
 We compile this:
 
 void compare (long long foo) {
@@ -1296,44 +950,54 @@ void compare (long long foo) {
 
 to:
 
-_compare:
-        subl    $12, %esp
-        cmpl    $0, 16(%esp)
+compare:
+        subl    $4, %esp
+        cmpl    $0, 8(%esp)
         setne   %al
         movzbw  %al, %ax
-        cmpl    $1, 20(%esp)
+        cmpl    $1, 12(%esp)
         setg    %cl
         movzbw  %cl, %cx
         cmove   %ax, %cx
-        movw    %cx, %ax
-        testb   $1, %al
-        je      LBB1_2  # cond_true
+        testb   $1, %cl
+        jne     .LBB1_2 # UnifiedReturnBlock
+.LBB1_1:        # ifthen
+        call    abort
+.LBB1_2:        # UnifiedReturnBlock
+        addl    $4, %esp
+        ret
 
 (also really horrible code on ppc).  This is due to the expand code for 64-bit
 compares.  GCC produces multiple branches, which is much nicer:
 
-_compare:
-        pushl   %ebp
-        movl    %esp, %ebp
-        subl    $8, %esp
-        movl    8(%ebp), %eax
-        movl    12(%ebp), %edx
-        subl    $1, %edx
-        jg     L5
-L7:
-        jl      L4
+compare:
+        subl    $12, %esp
+        movl    20(%esp), %edx
+        movl    16(%esp), %eax
+        decl    %edx
+        jle     .L7
+.L5:
+        addl    $12, %esp
+        ret
+        .p2align 4,,7
+.L7:
+        jl      .L4
         cmpl    $0, %eax
-        jbe      L4
-L5:
+        .p2align 4,,8
+        ja      .L5
+.L4:
+        .p2align 4,,9
+        call    abort
 
 //===---------------------------------------------------------------------===//
 
 Tail call optimization improvements: Tail call optimization currently
 pushes all arguments on the top of the stack (their normal place for
-non-tail call optimized calls) before moving them to actual stack
-slot. This is done to prevent overwriting of parameters (see example
-below) that might be used, since the arguments of the callee
-overwrites caller's arguments.
+non-tail call optimized calls) that source from the caller's arguments
+or that source from a virtual register (also possibly sourcing from
+the caller's arguments).
+This is done to prevent overwriting of parameters (see example
+below) that might be used later.
 
 example:  
 
@@ -1352,13 +1016,6 @@ arg2 of the caller.
 
 Possible optimizations:
 
- - Only push those arguments to the top of the stack that are actual
-   parameters of the caller function and have no local value in the
-   caller.
-
-   In the above example local does not need to be pushed onto the top
-   of the stack as it is definitely not a caller's function
-   parameter.
 
  - Analyse the actual parameters of the callee to see which would
    overwrite a caller parameter which is used by the callee and only
@@ -1380,35 +1037,6 @@ Possible optimizations:
    Here we need to push the arguments because they overwrite each
    other.
 
-
-   Code for lowering directly onto callers arguments:
-+  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
-+  SmallVector<SDOperand, 8> MemOpChains;
-+
-+  SDOperand FramePtr;
-+  SDOperand PtrOff;
-+  SDOperand FIN;
-+  int FI = 0;
-+  // Walk the register/memloc assignments, inserting copies/loads.
-+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-+    CCValAssign &VA = ArgLocs[i];
-+    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
-+    
-+    ....
-+    
-+    if (VA.isRegLoc()) {
-+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
-+    } else {
-+      assert(VA.isMemLoc());
-+      // create frame index
-+      int32_t Offset = VA.getLocMemOffset()+FPDiff;
-+      uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
-+      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
-+      FIN = DAG.getFrameIndex(FI, MVT::i32);
-+      // store relative to framepointer
-+      MemOpChains.push_back(DAG.getStore(Chain, Arg, FIN, NULL, 0));
-+    }
-+  }
 //===---------------------------------------------------------------------===//
 
 main ()
@@ -1503,7 +1131,7 @@ Should compile into:
 
 _foo:
         movzwl  4(%esp), %eax
-        orb     $-1, %al           ;; 'orl 255' is also fine :)
+        orl     $255, %eax
         ret
 
 instead of:
@@ -1515,23 +1143,44 @@ _foo:
 
 //===---------------------------------------------------------------------===//
 
-We're missing an obvious fold of a load into imul:
+We're codegen'ing multiply of long longs inefficiently:
 
-int test(long a, long b) { return a * b; } 
+unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
+  return arg1 *  arg2;
+}
 
-LLVM produces:
-_test:
-        movl    4(%esp), %ecx
-        movl    8(%esp), %eax
-        imull   %ecx, %eax
-        ret
+We compile to (fomit-frame-pointer):
 
-vs:
-_test:
-        movl    8(%esp), %eax
-        imull   4(%esp), %eax
+_LLM:
+	pushl	%esi
+	movl	8(%esp), %ecx
+	movl	16(%esp), %esi
+	movl	%esi, %eax
+	mull	%ecx
+	imull	12(%esp), %esi
+	addl	%edx, %esi
+	imull	20(%esp), %ecx
+	movl	%esi, %edx
+	addl	%ecx, %edx
+	popl	%esi
+	ret
+
+This looks like a scheduling deficiency and lack of remat of the load from
+the argument area.  ICC apparently produces:
+
+        movl      8(%esp), %ecx
+        imull     12(%esp), %ecx
+        movl      16(%esp), %eax
+        imull     4(%esp), %eax 
+        addl      %eax, %ecx  
+        movl      4(%esp), %eax
+        mull      12(%esp) 
+        addl      %ecx, %edx
         ret
 
+Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
+
 //===---------------------------------------------------------------------===//
 
 We can fold a store into "zeroing a reg".  Instead of:
@@ -1545,6 +1194,9 @@ movl    $0, 124(%esp)
 
 if the flags of the xor are dead.
 
+Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
+be folded into: shl [mem], 1
+
 //===---------------------------------------------------------------------===//
 
 This testcase misses a read/modify/write opportunity (from PR1425):
@@ -1597,3 +1249,686 @@ a stride-4 IV, would would allow all the scales in the loop to go away.
 This would result in smaller code and more efficient microops.
 
 //===---------------------------------------------------------------------===//
+
+In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
+or and instruction, for example:
+
+	xorpd	LCPI1_0, %xmm2
+
+However, if xmm2 gets spilled, we end up with really ugly code like this:
+
+	movsd	(%esp), %xmm0
+	xorpd	LCPI1_0, %xmm0
+	movsd	%xmm0, (%esp)
+
+Since we 'know' that this is a 'neg', we can actually "fold" the spill into
+the neg/abs instruction, turning it into an *integer* operation, like this:
+
+	xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
+
+you could also use xorb, but xorl is less likely to lead to a partial register
+stall.  Here is a contrived testcase:
+
+double a, b, c;
+void test(double *P) {
+  double X = *P;
+  a = X;
+  bar();
+  X = -X;
+  b = X;
+  bar();
+  c = X;
+}
+
+//===---------------------------------------------------------------------===//
+
+handling llvm.memory.barrier on pre SSE2 cpus
+
+should generate:
+lock ; mov %esp, %esp
+
+//===---------------------------------------------------------------------===//
+
+The generated code on x86 for checking for signed overflow on a multiply the
+obvious way is much longer than it needs to be.
+
+int x(int a, int b) {
+  long long prod = (long long)a*b;
+  return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
+}
+
+See PR2053 for more details.
+
+//===---------------------------------------------------------------------===//
+
+We should investigate using cdq/cltd (effect: edx = sar eax, 31)
+more aggressively; it should cost the same as a move+shift on any modern
+processor, but it's a lot shorter. Downside is that it puts more
+pressure on register allocation because it has fixed operands.
+
+Example:
+int abs(int x) {return x < 0 ? -x : x;}
+
+gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
+abs:
+        movl    4(%esp), %eax
+        cltd
+        xorl    %edx, %eax
+        subl    %edx, %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+int test(unsigned long a, unsigned long b) { return -(a < b); }
+
+We currently compile this to:
+
+define i32 @test(i32 %a, i32 %b) nounwind  {
+	%tmp3 = icmp ult i32 %a, %b		; <i1> [#uses=1]
+	%tmp34 = zext i1 %tmp3 to i32		; <i32> [#uses=1]
+	%tmp5 = sub i32 0, %tmp34		; <i32> [#uses=1]
+	ret i32 %tmp5
+}
+
+and
+
+_test:
+	movl	8(%esp), %eax
+	cmpl	%eax, 4(%esp)
+	setb	%al
+	movzbl	%al, %eax
+	negl	%eax
+	ret
+
+Several deficiencies here.  First, we should instcombine zext+neg into sext:
+
+define i32 @test2(i32 %a, i32 %b) nounwind  {
+	%tmp3 = icmp ult i32 %a, %b		; <i1> [#uses=1]
+	%tmp34 = sext i1 %tmp3 to i32		; <i32> [#uses=1]
+	ret i32 %tmp34
+}
+
+However, before we can do that, we have to fix the bad codegen that we get for
+sext from bool:
+
+_test2:
+	movl	8(%esp), %eax
+	cmpl	%eax, 4(%esp)
+	setb	%al
+	movzbl	%al, %eax
+	shll	$31, %eax
+	sarl	$31, %eax
+	ret
+
+This code should be at least as good as the code above.  Once this is fixed, we
+can optimize this specific case even more to:
+
+	movl	8(%esp), %eax
+	xorl	%ecx, %ecx
+        cmpl    %eax, 4(%esp)
+        sbbl    %ecx, %ecx
+
+//===---------------------------------------------------------------------===//
+
+Take the following code (from 
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
+
+extern unsigned char first_one[65536];
+int FirstOnet(unsigned long long arg1)
+{
+  if (arg1 >> 48)
+    return (first_one[arg1 >> 48]);
+  return 0;
+}
+
+
+The following code is currently generated:
+FirstOnet:
+        movl    8(%esp), %eax
+        cmpl    $65536, %eax
+        movl    4(%esp), %ecx
+        jb      .LBB1_2 # UnifiedReturnBlock
+.LBB1_1:        # ifthen
+        shrl    $16, %eax
+        movzbl  first_one(%eax), %eax
+        ret
+.LBB1_2:        # UnifiedReturnBlock
+        xorl    %eax, %eax
+        ret
+
+There are a few possible improvements here:
+1. We should be able to eliminate the dead load into %ecx
+2. We could change the "movl 8(%esp), %eax" into
+   "movzwl 10(%esp), %eax"; this lets us change the cmpl
+   into a testl, which is shorter, and eliminate the shift.
+
+We could also in theory eliminate the branch by using a conditional
+for the address of the load, but that seems unlikely to be worthwhile
+in general.
+
+//===---------------------------------------------------------------------===//
+
+We compile this function:
+
+define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
+entry:
+	%tmp2 = icmp eq i8 %d, 0		; <i1> [#uses=1]
+	br i1 %tmp2, label %bb7, label %bb
+
+bb:		; preds = %entry
+	%tmp6 = add i32 %b, %a		; <i32> [#uses=1]
+	ret i32 %tmp6
+
+bb7:		; preds = %entry
+	%tmp10 = sub i32 %a, %c		; <i32> [#uses=1]
+	ret i32 %tmp10
+}
+
+to:
+
+_foo:
+	cmpb	$0, 16(%esp)
+	movl	12(%esp), %ecx
+	movl	8(%esp), %eax
+	movl	4(%esp), %edx
+	je	LBB1_2	# bb7
+LBB1_1:	# bb
+	addl	%edx, %eax
+	ret
+LBB1_2:	# bb7
+	movl	%edx, %eax
+	subl	%ecx, %eax
+	ret
+
+The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2
+if it commuted the addl in LBB1_1.
+
+//===---------------------------------------------------------------------===//
+
+See rdar://4653682.
+
+From flops:
+
+LBB1_15:        # bb310
+        cvtss2sd        LCPI1_0, %xmm1
+        addsd   %xmm1, %xmm0
+        movsd   176(%esp), %xmm2
+        mulsd   %xmm0, %xmm2
+        movapd  %xmm2, %xmm3
+        mulsd   %xmm3, %xmm3
+        movapd  %xmm3, %xmm4
+        mulsd   LCPI1_23, %xmm4
+        addsd   LCPI1_24, %xmm4
+        mulsd   %xmm3, %xmm4
+        addsd   LCPI1_25, %xmm4
+        mulsd   %xmm3, %xmm4
+        addsd   LCPI1_26, %xmm4
+        mulsd   %xmm3, %xmm4
+        addsd   LCPI1_27, %xmm4
+        mulsd   %xmm3, %xmm4
+        addsd   LCPI1_28, %xmm4
+        mulsd   %xmm3, %xmm4
+        addsd   %xmm1, %xmm4
+        mulsd   %xmm2, %xmm4
+        movsd   152(%esp), %xmm1
+        addsd   %xmm4, %xmm1
+        movsd   %xmm1, 152(%esp)
+        incl    %eax
+        cmpl    %eax, %esi
+        jge     LBB1_15 # bb310
+LBB1_16:        # bb358.loopexit
+        movsd   152(%esp), %xmm0
+        addsd   %xmm0, %xmm0
+        addsd   LCPI1_22, %xmm0
+        movsd   %xmm0, 152(%esp)
+
+Rather than spilling the result of the last addsd in the loop, we should
+insert a copy to split the interval (one for the duration of the loop, one
+extending to the fall through). The register pressure in the loop isn't high
+enough to warrant the spill.
+
+Also check why xmm7 is not used at all in the function.
+
+//===---------------------------------------------------------------------===//
+
+Legalize loses track of the fact that bools are always zero extended when in
+memory.  This causes us to compile abort_gzip (from 164.gzip) from:
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin8"
+@in_exit.4870.b = internal global i1 false		; <i1*> [#uses=2]
+define fastcc void @abort_gzip() noreturn nounwind  {
+entry:
+	%tmp.b.i = load i1* @in_exit.4870.b		; <i1> [#uses=1]
+	br i1 %tmp.b.i, label %bb.i, label %bb4.i
+bb.i:		; preds = %entry
+	tail call void @exit( i32 1 ) noreturn nounwind 
+	unreachable
+bb4.i:		; preds = %entry
+	store i1 true, i1* @in_exit.4870.b
+	tail call void @exit( i32 1 ) noreturn nounwind 
+	unreachable
+}
+declare void @exit(i32) noreturn nounwind 
+
+into:
+
+_abort_gzip:
+	subl	$12, %esp
+	movb	_in_exit.4870.b, %al
+	notb	%al
+	testb	$1, %al
+	jne	LBB1_2	## bb4.i
+LBB1_1:	## bb.i
+  ...
+
+//===---------------------------------------------------------------------===//
+
+We compile:
+
+int test(int x, int y) {
+  return x-y-1;
+}
+
+into (-m64):
+
+_test:
+	decl	%edi
+	movl	%edi, %eax
+	subl	%esi, %eax
+	ret
+
+it would be better to codegen as: x+~y  (notl+addl)
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+int foo(const char *str,...)
+{
+ __builtin_va_list a; int x;
+ __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
+ return x;
+}
+
+gets compiled into this on x86-64:
+	subq    $200, %rsp
+        movaps  %xmm7, 160(%rsp)
+        movaps  %xmm6, 144(%rsp)
+        movaps  %xmm5, 128(%rsp)
+        movaps  %xmm4, 112(%rsp)
+        movaps  %xmm3, 96(%rsp)
+        movaps  %xmm2, 80(%rsp)
+        movaps  %xmm1, 64(%rsp)
+        movaps  %xmm0, 48(%rsp)
+        movq    %r9, 40(%rsp)
+        movq    %r8, 32(%rsp)
+        movq    %rcx, 24(%rsp)
+        movq    %rdx, 16(%rsp)
+        movq    %rsi, 8(%rsp)
+        leaq    (%rsp), %rax
+        movq    %rax, 192(%rsp)
+        leaq    208(%rsp), %rax
+        movq    %rax, 184(%rsp)
+        movl    $48, 180(%rsp)
+        movl    $8, 176(%rsp)
+        movl    176(%rsp), %eax
+        cmpl    $47, %eax
+        jbe     .LBB1_3 # bb
+.LBB1_1:        # bb3
+        movq    184(%rsp), %rcx
+        leaq    8(%rcx), %rax
+        movq    %rax, 184(%rsp)
+.LBB1_2:        # bb4
+        movl    (%rcx), %eax
+        addq    $200, %rsp
+        ret
+.LBB1_3:        # bb
+        movl    %eax, %ecx
+        addl    $8, %eax
+        addq    192(%rsp), %rcx
+        movl    %eax, 176(%rsp)
+        jmp     .LBB1_2 # bb4
+
+gcc 4.3 generates:
+	subq    $96, %rsp
+.LCFI0:
+        leaq    104(%rsp), %rax
+        movq    %rsi, -80(%rsp)
+        movl    $8, -120(%rsp)
+        movq    %rax, -112(%rsp)
+        leaq    -88(%rsp), %rax
+        movq    %rax, -104(%rsp)
+        movl    $8, %eax
+        cmpl    $48, %eax
+        jb      .L6
+        movq    -112(%rsp), %rdx
+        movl    (%rdx), %eax
+        addq    $96, %rsp
+        ret
+        .p2align 4,,10
+        .p2align 3
+.L6:
+        mov     %eax, %edx
+        addq    -104(%rsp), %rdx
+        addl    $8, %eax
+        movl    %eax, -120(%rsp)
+        movl    (%rdx), %eax
+        addq    $96, %rsp
+        ret
+
+and it gets compiled into this on x86:
+	pushl   %ebp
+        movl    %esp, %ebp
+        subl    $4, %esp
+        leal    12(%ebp), %eax
+        movl    %eax, -4(%ebp)
+        leal    16(%ebp), %eax
+        movl    %eax, -4(%ebp)
+        movl    12(%ebp), %eax
+        addl    $4, %esp
+        popl    %ebp
+        ret
+
+gcc 4.3 generates:
+	pushl   %ebp
+        movl    %esp, %ebp
+        movl    12(%ebp), %eax
+        popl    %ebp
+        ret
+
+//===---------------------------------------------------------------------===//
+
+Teach tblgen not to check bitconvert source type in some cases. This allows us
+to consolidate the following patterns in X86InstrMMX.td:
+
+def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+                                                  (iPTR 0))))),
+          (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+                                                  (iPTR 0))))),
+          (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+                                                  (iPTR 0))))),
+          (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
+
+There are other cases in various td files.
+
+//===---------------------------------------------------------------------===//
+
+Take something like the following on x86-32:
+unsigned a(unsigned long long x, unsigned y) {return x % y;}
+
+We currently generate a libcall, but we really shouldn't: the expansion is
+shorter and likely faster than the libcall.  The expected code is something
+like the following:
+
+	movl	12(%ebp), %eax
+	movl	16(%ebp), %ecx
+	xorl	%edx, %edx
+	divl	%ecx
+	movl	8(%ebp), %eax
+	divl	%ecx
+	movl	%edx, %eax
+	ret
+
+A similar code sequence works for division.
+
+//===---------------------------------------------------------------------===//
+
+These should compile to the same code, but the latter codegen's to useless
+instructions on X86. This may be a trivial dag combine (GCC PR7061):
+
+struct s1 { unsigned char a, b; };
+unsigned long f1(struct s1 x) {
+    return x.a + x.b;
+}
+struct s2 { unsigned a: 8, b: 8; };
+unsigned long f2(struct s2 x) {
+    return x.a + x.b;
+}
+
+//===---------------------------------------------------------------------===//
+
+We currently compile this:
+
+define i32 @func1(i32 %v1, i32 %v2) nounwind {
+entry:
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+  %sum = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  br i1 %obit, label %overflow, label %normal
+normal:
+  ret i32 %sum
+overflow:
+  call void @llvm.trap()
+  unreachable
+}
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
+declare void @llvm.trap()
+
+to:
+
+_func1:
+	movl	4(%esp), %eax
+	addl	8(%esp), %eax
+	jo	LBB1_2	## overflow
+LBB1_1:	## normal
+	ret
+LBB1_2:	## overflow
+	ud2
+
+it would be nice to produce "into" someday.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+void vec_mpys1(int y[], const int x[], int scaler) {
+int i;
+for (i = 0; i < 150; i++)
+ y[i] += (((long long)scaler * (long long)x[i]) >> 31);
+}
+
+Compiles to this loop with GCC 3.x:
+
+.L5:
+	movl	%ebx, %eax
+	imull	(%edi,%ecx,4)
+	shrdl	$31, %edx, %eax
+	addl	%eax, (%esi,%ecx,4)
+	incl	%ecx
+	cmpl	$149, %ecx
+	jle	.L5
+
+llvm-gcc compiles it to the much uglier:
+
+LBB1_1:	## bb1
+	movl	24(%esp), %eax
+	movl	(%eax,%edi,4), %ebx
+	movl	%ebx, %ebp
+	imull	%esi, %ebp
+	movl	%ebx, %eax
+	mull	%ecx
+	addl	%ebp, %edx
+	sarl	$31, %ebx
+	imull	%ecx, %ebx
+	addl	%edx, %ebx
+	shldl	$1, %eax, %ebx
+	movl	20(%esp), %eax
+	addl	%ebx, (%eax,%edi,4)
+	incl	%edi
+	cmpl	$150, %edi
+	jne	LBB1_1	## bb1
+
+//===---------------------------------------------------------------------===//
+
+Test instructions can be eliminated by using EFLAGS values from arithmetic
+instructions. This is currently not done for mul, and, or, xor, neg, shl,
+sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
+for read-modify-write instructions. It is also currently not done if the
+OF or CF flags are needed.
+
+The shift operators have the complication that when the shift count is
+zero, EFLAGS is not set, so they can only subsume a test instruction if
+the shift count is known to be non-zero. Also, using the EFLAGS value
+from a shift is apparently very slow on some x86 implementations.
+
+In read-modify-write instructions, the root node in the isel match is
+the store, and isel has no way for the use of the EFLAGS result of the
+arithmetic to be remapped to the new node.
+
+Add and subtract instructions set OF on signed overflow and CF on unsigned
+overflow, while test instructions always clear OF and CF. In order to
+replace a test with an add or subtract in a situation where OF or CF is
+needed, codegen must be able to prove that the operation cannot see
+signed or unsigned overflow, respectively.
+
+//===---------------------------------------------------------------------===//
+
+memcpy/memmove do not lower to SSE copies when possible.  A silly example is:
+define <16 x float> @foo(<16 x float> %A) nounwind {
+	%tmp = alloca <16 x float>, align 16
+	%tmp2 = alloca <16 x float>, align 16
+	store <16 x float> %A, <16 x float>* %tmp
+	%s = bitcast <16 x float>* %tmp to i8*
+	%s2 = bitcast <16 x float>* %tmp2 to i8*
+	call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
+	%R = load <16 x float>* %tmp2
+	ret <16 x float> %R
+}
+
+declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
+
+which compiles to:
+
+_foo:
+	subl	$140, %esp
+	movaps	%xmm3, 112(%esp)
+	movaps	%xmm2, 96(%esp)
+	movaps	%xmm1, 80(%esp)
+	movaps	%xmm0, 64(%esp)
+	movl	60(%esp), %eax
+	movl	%eax, 124(%esp)
+	movl	56(%esp), %eax
+	movl	%eax, 120(%esp)
+	movl	52(%esp), %eax
+        <many many more 32-bit copies>
+      	movaps	(%esp), %xmm0
+	movaps	16(%esp), %xmm1
+	movaps	32(%esp), %xmm2
+	movaps	48(%esp), %xmm3
+	addl	$140, %esp
+	ret
+
+On Nehalem, it may even be cheaper to just use movups when unaligned than to
+fall back to lower-granularity chunks.
+
+//===---------------------------------------------------------------------===//
+
+Implement processor-specific optimizations for parity with GCC on these
+processors.  GCC does two optimizations:
+
+1. ix86_pad_returns inserts a noop before ret instructions if immediately
+   preceded by a conditional branch or is the target of a jump.
+2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
+   code contains more than 3 branches.
+   
+The first one is done for all AMDs, Core2, and "Generic"
+The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
+  Core 2, and "Generic"
+
+//===---------------------------------------------------------------------===//
+
+Testcase:
+int a(int x) { return (x & 127) > 31; }
+
+Current output:
+	movl	4(%esp), %eax
+	andl	$127, %eax
+	cmpl	$31, %eax
+	seta	%al
+	movzbl	%al, %eax
+	ret
+
+Ideal output:
+	xorl	%eax, %eax
+	testl	$96, 4(%esp)
+	setne	%al
+	ret
+
+This should definitely be done in instcombine, canonicalizing the range
+condition into a != condition.  We get this IR:
+
+define i32 @a(i32 %x) nounwind readnone {
+entry:
+	%0 = and i32 %x, 127		; <i32> [#uses=1]
+	%1 = icmp ugt i32 %0, 31		; <i1> [#uses=1]
+	%2 = zext i1 %1 to i32		; <i32> [#uses=1]
+	ret i32 %2
+}
+
+Instcombine prefers to strength reduce relational comparisons to equality
+comparisons when possible; this should be another case of that.  This could
+be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
+looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already
+be redesigned to use ComputeMaskedBits and friends.
+
+
+//===---------------------------------------------------------------------===//
+Testcase:
+int x(int a) { return (a&0xf0)>>4; }
+
+Current output:
+	movl	4(%esp), %eax
+	shrl	$4, %eax
+	andl	$15, %eax
+	ret
+
+Ideal output:
+	movzbl	4(%esp), %eax
+	shrl	$4, %eax
+	ret
+
+//===---------------------------------------------------------------------===//
+
+Testcase:
+int x(int a) { return (a & 0x80) ? 0x100 : 0; }
+int y(int a) { return (a & 0x80) *2; }
+
+Current:
+	testl	$128, 4(%esp)
+	setne	%al
+	movzbl	%al, %eax
+	shll	$8, %eax
+	ret
+
+Better:
+	movl	4(%esp), %eax
+	addl	%eax, %eax
+	andl	$256, %eax
+	ret
+
+This is another general instcombine transformation that is profitable on all
+targets.  In LLVM IR, these functions look like this:
+
+define i32 @x(i32 %a) nounwind readnone {
+entry:
+	%0 = and i32 %a, 128
+	%1 = icmp eq i32 %0, 0
+	%iftmp.0.0 = select i1 %1, i32 0, i32 256
+	ret i32 %iftmp.0.0
+}
+
+define i32 @y(i32 %a) nounwind readnone {
+entry:
+	%0 = shl i32 %a, 1
+	%1 = and i32 %0, 256
+	ret i32 %1
+}
+
+Replacing an icmp+select with a shift should always be considered profitable in
+instcombine.
+
+//===---------------------------------------------------------------------===//