X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FREADME.txt;h=073e2dacef183b26bab7ac32332d4a19f2702471;hb=c59e52108bbfca50b23c5d10706484d4b012c344;hp=5084467657ec8f04dd0a149a05c69d6715bafb8b;hpb=2420d812475ebbb835585db1b2bbad04e55cb6f3;p=oota-llvm.git diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index 5084467657e..073e2dacef1 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -2,6 +2,14 @@ // Random ideas for the X86 backend. //===---------------------------------------------------------------------===// +Missing features: + - Support for SSE4: http://www.intel.com/software/penryn +http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf + - support for 3DNow! + - weird abis? + +//===---------------------------------------------------------------------===// + Add a MUL2U and MUL2S nodes to represent a multiply that returns both the Hi and Lo parts (combination of MUL and MULH[SU] into one node). Add this to X86, & make the dag combiner produce it when needed. This will eliminate one @@ -18,6 +26,26 @@ long long test(int X, int Y) { return (long long)X*Y; } ... which should only be one imul instruction. +or: + +unsigned long long int t2(unsigned int a, unsigned int b) { + return (unsigned long long)a * b; +} + +... which should be one mul instruction. + + +This can be done with a custom expander, but it would be nice to move this to +generic code. + +//===---------------------------------------------------------------------===// + +CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86 +backend knows how to three-addressify this shift, but it appears the register +allocator isn't even asking it to do so in this case. We should investigate +why this isn't happening, it could have significant impact on other important +cases for X86 as well. + //===---------------------------------------------------------------------===// This should be one DIV/IDIV instruction, not a libcall: @@ -45,6 +73,20 @@ http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html Another useful one would be ~0ULL >> X and ~0ULL << X. +One better solution for 1LL << x is: + xorl %eax, %eax + xorl %edx, %edx + testb $32, %cl + sete %al + setne %dl + sall %cl, %eax + sall %cl, %edx + +But that requires good 8-bit subreg support. + +64-bit shifts (in general) expand to really bad code. Instead of using +cmovs, we should expand to a conditional branch like GCC produces. + //===---------------------------------------------------------------------===// Compile this: @@ -80,15 +122,6 @@ allocator. Delay codegen until post register allocation. //===---------------------------------------------------------------------===// -Model X86 EFLAGS as a real register to avoid redudant cmp / test. e.g. - - cmpl $1, %eax - setg %al - testb %al, %al # unnecessary - jne .BB7 - -//===---------------------------------------------------------------------===// - Count leading zeros and count trailing zeros: int clz(int X) { return __builtin_clz(X); } @@ -106,11 +139,18 @@ ctz: however, check that these are defined for 0 and 32. Our intrinsics are, GCC's aren't. 
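
A sketch of the zero-input wrinkle (ctz_defined is a made-up name, not backend
code): bsf/bsr leave their destination undefined when the source is 0, while
our intrinsics are defined there, so a branchless lowering needs a fixup such
as a cmov:

int ctz_defined(unsigned X) {
  /* bsf covers the 1..31 cases; the ": 32" fixup is the part bsf alone
     does not provide (e.g. bsf followed by a cmov). */
  return X ? __builtin_ctz(X) : 32;
}
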
+Another example (use predsimplify to eliminate a select): + +int foo (unsigned long j) { + if (j) + return __builtin_ffs (j) - 1; + else + return 0; +} + //===---------------------------------------------------------------------===// -Use push/pop instructions in prolog/epilog sequences instead of stores off -ESP (certain code size win, perf win on some [which?] processors). -Also, it appears icc use push for parameter passing. Need to investigate. +It appears icc use push for parameter passing. Need to investigate. //===---------------------------------------------------------------------===// @@ -126,6 +166,8 @@ commutative, it is not matched with the load on both sides. The dag combiner should be made smart enough to cannonicalize the load into the RHS of a compare when it can invert the result of the compare for free. +//===---------------------------------------------------------------------===// + How about intrinsics? An example is: *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C)); @@ -140,128 +182,6 @@ target specific hook. //===---------------------------------------------------------------------===// -When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and -other fast SSE modes. - -//===---------------------------------------------------------------------===// - -Think about doing i64 math in SSE regs. - -//===---------------------------------------------------------------------===// - -The DAG Isel doesn't fold the loads into the adds in this testcase. The -pattern selector does. This is because the chain value of the load gets -selected first, and the loads aren't checking to see if they are only used by -and add. - -.ll: - -int %test(int* %x, int* %y, int* %z) { - %X = load int* %x - %Y = load int* %y - %Z = load int* %z - %a = add int %X, %Y - %b = add int %a, %Z - ret int %b -} - -dag isel: - -_test: - movl 4(%esp), %eax - movl (%eax), %eax - movl 8(%esp), %ecx - movl (%ecx), %ecx - addl %ecx, %eax - movl 12(%esp), %ecx - movl (%ecx), %ecx - addl %ecx, %eax - ret - -pattern isel: - -_test: - movl 12(%esp), %ecx - movl 4(%esp), %edx - movl 8(%esp), %eax - movl (%eax), %eax - addl (%edx), %eax - addl (%ecx), %eax - ret - -This is bad for register pressure, though the dag isel is producing a -better schedule. :) - -//===---------------------------------------------------------------------===// - -This testcase should have no SSE instructions in it, and only one load from -a constant pool: - -double %test3(bool %B) { - %C = select bool %B, double 123.412, double 523.01123123 - ret double %C -} - -Currently, the select is being lowered, which prevents the dag combiner from -turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)' - -The pattern isel got this one right. - -//===---------------------------------------------------------------------===// - -SSE doesn't have [mem] op= reg instructions. If we have an SSE instruction -like this: - - X += y - -and the register allocator decides to spill X, it is cheaper to emit this as: - -Y += [xslot] -store Y -> [xslot] - -than as: - -tmp = [xslot] -tmp += y -store tmp -> [xslot] - -..and this uses one fewer register (so this should be done at load folding -time, not at spiller time). *Note* however that this can only be done -if Y is dead. Here's a testcase: - -%.str_3 = external global [15 x sbyte] ; <[15 x sbyte]*> [#uses=0] -implementation ; Functions: -declare void %printf(int, ...) 
-void %main() { -build_tree.exit: - br label %no_exit.i7 -no_exit.i7: ; preds = %no_exit.i7, %build_tree.exit - %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ] ; [#uses=1] - %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ] ; [#uses=1] - %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00 - %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00 - br bool false, label %Compute_Tree.exit23, label %no_exit.i7 -Compute_Tree.exit23: ; preds = %no_exit.i7 - tail call void (int, ...)* %printf( int 0 ) - store double %tmp.34.i18, double* null - ret void -} - -We currently emit: - -.BBmain_1: - xorpd %XMM1, %XMM1 - addsd %XMM0, %XMM1 -*** movsd %XMM2, QWORD PTR [%ESP + 8] -*** addsd %XMM2, %XMM1 -*** movsd QWORD PTR [%ESP + 8], %XMM2 - jmp .BBmain_1 # no_exit.i7 - -This is a bugpoint reduced testcase, which is why the testcase doesn't make -much sense (e.g. its an infinite loop). :) - -//===---------------------------------------------------------------------===// - In many cases, LLVM generates code like this: _test: @@ -275,7 +195,7 @@ on some processors (which ones?), it is more efficient to do this: _test: movl 8(%esp), %ebx - xor %eax, %eax + xor %eax, %eax cmpl %ebx, 4(%esp) setl %al ret @@ -284,68 +204,6 @@ Doing this correctly is tricky though, as the xor clobbers the flags. //===---------------------------------------------------------------------===// -We should generate 'test' instead of 'cmp' in various cases, e.g.: - -bool %test(int %X) { - %Y = shl int %X, ubyte 1 - %C = seteq int %Y, 0 - ret bool %C -} -bool %test(int %X) { - %Y = and int %X, 8 - %C = seteq int %Y, 0 - ret bool %C -} - -This may just be a matter of using 'test' to write bigger patterns for X86cmp. - -An important case is comparison against zero: - -if (X == 0) ... - -instead of: - - cmpl $0, %eax - je LBB4_2 #cond_next - -use: - test %eax, %eax - jz LBB4_2 - -which is smaller. - -//===---------------------------------------------------------------------===// - -SSE should implement 'select_cc' using 'emulated conditional moves' that use -pcmp/pand/pandn/por to do a selection instead of a conditional branch: - -double %X(double %Y, double %Z, double %A, double %B) { - %C = setlt double %A, %B - %z = add double %Z, 0.0 ;; select operand is not a load - %D = select bool %C, double %Y, double %z - ret double %D -} - -We currently emit: - -_X: - subl $12, %esp - xorpd %xmm0, %xmm0 - addsd 24(%esp), %xmm0 - movsd 32(%esp), %xmm1 - movsd 16(%esp), %xmm2 - ucomisd 40(%esp), %xmm1 - jb LBB_X_2 -LBB_X_1: - movsd %xmm0, %xmm2 -LBB_X_2: - movsd %xmm2, (%esp) - fldl (%esp) - addl $12, %esp - ret - -//===---------------------------------------------------------------------===// - We should generate bts/btr/etc instructions on targets where they are cheap or when codesize is important. e.g., for: @@ -375,12 +233,6 @@ when we can spare a register. It reduces code size. //===---------------------------------------------------------------------===// -It's not clear whether we should use pxor or xorps / xorpd to clear XMM -registers. The choice may depend on subtarget information. We should do some -more experiments on different x86 machines. - -//===---------------------------------------------------------------------===// - Evaluate what the best way to codegen sdiv X, (2^C) is. 
For X/8, we currently get this: @@ -412,36 +264,6 @@ which is probably slower, but it's interesting at least :) //===---------------------------------------------------------------------===// -Currently the x86 codegen isn't very good at mixing SSE and FPStack -code: - -unsigned int foo(double x) { return x; } - -foo: - subl $20, %esp - movsd 24(%esp), %xmm0 - movsd %xmm0, 8(%esp) - fldl 8(%esp) - fisttpll (%esp) - movl (%esp), %eax - addl $20, %esp - ret - -This will be solved when we go to a dynamic programming based isel. - -//===---------------------------------------------------------------------===// - -Should generate min/max for stuff like: - -void minf(float a, float b, float *X) { - *X = a <= b ? a : b; -} - -Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN -and ISD::FMAX node types? - -//===---------------------------------------------------------------------===// - The first BB of this code: declare bool %foo() @@ -472,22 +294,6 @@ Enable X86InstrInfo::convertToThreeAddress(). //===---------------------------------------------------------------------===// -Investigate whether it is better to codegen the following - - %tmp.1 = mul int %x, 9 -to - - movl 4(%esp), %eax - leal (%eax,%eax,8), %eax - -as opposed to what llc is currently generating: - - imull $9, 4(%esp), %eax - -Currently the load folding imull has a higher complexity than the LEA32 pattern. - -//===---------------------------------------------------------------------===// - We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl We should leave these as libcalls for everything over a much lower threshold, since libc is hand tuned for medium and large mem ops (avoiding RFO for large @@ -495,45 +301,6 @@ stores, TLB preheating, etc) //===---------------------------------------------------------------------===// -Lower memcpy / memset to a series of SSE 128 bit move instructions when it's -feasible. - -//===---------------------------------------------------------------------===// - -Teach the coalescer to commute 2-addr instructions, allowing us to eliminate -the reg-reg copy in this example: - -float foo(int *x, float *y, unsigned c) { - float res = 0.0; - unsigned i; - for (i = 0; i < c; i++) { - float xx = (float)x[i]; - xx = xx * y[i]; - xx += res; - res = xx; - } - return res; -} - -LBB_foo_3: # no_exit - cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI] - mulss %XMM0, DWORD PTR [%EAX + 4*%ESI] - addss %XMM0, %XMM1 - inc %ESI - cmp %ESI, %ECX -**** movaps %XMM1, %XMM0 - jb LBB_foo_3 # no_exit - -//===---------------------------------------------------------------------===// - -Codegen: - if (copysign(1.0, x) == copysign(1.0, y)) -into: - if (x^y & mask) -when using SSE. - -//===---------------------------------------------------------------------===// - Optimize this into something reasonable: x * copysign(1.0, y) * copysign(1.0, z) @@ -595,52 +362,64 @@ lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor. //===---------------------------------------------------------------------===// -Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 / -FR64 to VR128. +We are generating far worse code than gcc: -//===---------------------------------------------------------------------===// +volatile short X, Y; -mov $reg, 48(%esp) -... 
-leal 48(%esp), %eax -mov %eax, (%esp) -call _foo +void foo(int N) { + int i; + for (i = 0; i < N; i++) { X = i; Y = i*4; } +} -Obviously it would have been better for the first mov (or any op) to store -directly %esp[0] if there are no other uses. +LBB1_1: #bb.preheader + xorl %ecx, %ecx + xorw %dx, %dx +LBB1_2: #bb + movl L_X$non_lazy_ptr, %esi + movw %dx, (%esi) + movw %dx, %si + shlw $2, %si + movl L_Y$non_lazy_ptr, %edi + movw %si, (%edi) + incl %ecx + incw %dx + cmpl %eax, %ecx + jne LBB1_2 #bb -//===---------------------------------------------------------------------===// +vs. -Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half -of a v4sf value. + xorl %edx, %edx + movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi + movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx +L4: + movw %dx, (%esi) + leal 0(,%edx,4), %eax + movw %ax, (%ecx) + addl $1, %edx + cmpl %edx, %edi + jne L4 -//===---------------------------------------------------------------------===// +There are 3 issues: -Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}. -Perhaps use pxor / xorp* to clear a XMM register first? +1. Lack of post regalloc LICM. +2. LSR unable to reused IV for a different type (i16 vs. i32) even though + the cast would be free. //===---------------------------------------------------------------------===// -Better codegen for: - -void f(float a, float b, vector float * out) { *out = (vector float){ a, 0.0, 0.0, b}; } -void f(float a, float b, vector float * out) { *out = (vector float){ a, b, 0.0, 0}; } +Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 / +FR64 to VR128. -For the later we generate: +//===---------------------------------------------------------------------===// -_f: - pxor %xmm0, %xmm0 - movss 8(%esp), %xmm1 - movaps %xmm0, %xmm2 - unpcklps %xmm1, %xmm2 - movss 4(%esp), %xmm1 - unpcklps %xmm0, %xmm1 - unpcklps %xmm2, %xmm1 - movl 12(%esp), %eax - movaps %xmm1, (%eax) - ret +mov $reg, 48(%esp) +... +leal 48(%esp), %eax +mov %eax, (%esp) +call _foo -This seems like it should use shufps, one for each of a & b. +Obviously it would have been better for the first mov (or any op) to store +directly %esp[0] if there are no other uses. //===---------------------------------------------------------------------===// @@ -676,448 +455,661 @@ We probably need some kind of target DAG combine hook to fix this. //===---------------------------------------------------------------------===// -How to decide when to use the "floating point version" of logical ops? Here are -some code fragments: - - movaps LCPI5_5, %xmm2 - divps %xmm1, %xmm2 - mulps %xmm2, %xmm3 - mulps 8656(%ecx), %xmm3 - addps 8672(%ecx), %xmm3 - andps LCPI5_6, %xmm2 - andps LCPI5_1, %xmm3 - por %xmm2, %xmm3 - movdqa %xmm3, (%edi) - - movaps LCPI5_5, %xmm1 - divps %xmm0, %xmm1 - mulps %xmm1, %xmm3 - mulps 8656(%ecx), %xmm3 - addps 8672(%ecx), %xmm3 - andps LCPI5_6, %xmm1 - andps LCPI5_1, %xmm3 - orps %xmm1, %xmm3 - movaps %xmm3, 112(%esp) - movaps %xmm3, (%ebx) - -Due to some minor source change, the later case ended up using orps and movaps -instead of por and movdqa. Does it matter? +We generate significantly worse code for this than GCC: +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150 +http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701 + +There is also one case we do worse on PPC. //===---------------------------------------------------------------------===// -Use movddup to splat a v2f64 directly from a memory source. e.g. 
+If shorter, we should use things like: +movzwl %ax, %eax +instead of: +andl $65535, %EAX + +The former can also be used when the two-addressy nature of the 'and' would +require a copy to be inserted (in X86InstrInfo::convertToThreeAddress). + +//===---------------------------------------------------------------------===// -#include +Consider this: -void test(__m128d *r, double A) { - *r = _mm_set1_pd(A); +typedef struct pair { float A, B; } pair; +void pairtest(pair P, float *FP) { + *FP = P.A+P.B; } -llc: +We currently generate this code with llvmgcc4: -_test: - movsd 8(%esp), %xmm0 - unpcklpd %xmm0, %xmm0 - movl 4(%esp), %eax - movapd %xmm0, (%eax) - ret +_pairtest: + movl 8(%esp), %eax + movl 4(%esp), %ecx + movd %eax, %xmm0 + movd %ecx, %xmm1 + addss %xmm0, %xmm1 + movl 12(%esp), %eax + movss %xmm1, (%eax) + ret + +we should be able to generate: +_pairtest: + movss 4(%esp), %xmm0 + movl 12(%esp), %eax + addss 8(%esp), %xmm0 + movss %xmm0, (%eax) + ret -icc: +The issue is that llvmgcc4 is forcing the struct to memory, then passing it as +integer chunks. It does this so that structs like {short,short} are passed in +a single 32-bit integer stack slot. We should handle the safe cases above much +nicer, while still handling the hard cases. -_test: - movl 4(%esp), %eax - movddup 8(%esp), %xmm0 - movapd %xmm0, (%eax) +While true in general, in this specific case we could do better by promoting +load int + bitcast to float -> load fload. This basically needs alignment info, +the code is already implemented (but disabled) in dag combine). + +//===---------------------------------------------------------------------===// + +Another instruction selector deficiency: + +void %bar() { + %tmp = load int (int)** %foo + %tmp = tail call int %tmp( int 3 ) + ret void +} + +_bar: + subl $12, %esp + movl L_foo$non_lazy_ptr, %eax + movl (%eax), %eax + call *%eax + addl $12, %esp ret +The current isel scheme will not allow the load to be folded in the call since +the load's chain result is read by the callseq_start. + //===---------------------------------------------------------------------===// -X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible -to choose between movaps, movapd, and movdqa based on types of source and -destination? +Don't forget to find a way to squash noop truncates in the JIT environment. -How about andps, andpd, and pand? Do we really care about the type of the packed -elements? If not, why not always use the "ps" variants which are likely to be -shorter. +//===---------------------------------------------------------------------===// + +Implement anyext in the same manner as truncate that would allow them to be +eliminated. //===---------------------------------------------------------------------===// -We are emitting bad code for this: +How about implementing truncate / anyext as a property of machine instruction +operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register. +Do this for the cases where a truncate / anyext is guaranteed to be eliminated. +For IA32 that is truncate from 32 to 16 and anyext from 16 to 32. 
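
A tiny example of a truncate that should be free under that scheme
(trunc_to_i16 is only an illustration, not a test in the tree): the i32 -> i16
truncate below needs no instruction at all if the result is simply printed as
the 16-bit sub-register of the value already sitting in a 32-bit register.

unsigned short trunc_to_i16(unsigned int x) {
  /* Ideally a single 32-bit load of the argument, with the result
     referenced as %ax rather than copied or masked. */
  return (unsigned short)x;
}
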
-float %test(float* %V, int %I, int %D, float %V) { -entry: - %tmp = seteq int %D, 0 - br bool %tmp, label %cond_true, label %cond_false23 - -cond_true: - %tmp3 = getelementptr float* %V, int %I - %tmp = load float* %tmp3 - %tmp5 = setgt float %tmp, %V - %tmp6 = tail call bool %llvm.isunordered.f32( float %tmp, float %V ) - %tmp7 = or bool %tmp5, %tmp6 - br bool %tmp7, label %UnifiedReturnBlock, label %cond_next - -cond_next: - %tmp10 = add int %I, 1 - %tmp12 = getelementptr float* %V, int %tmp10 - %tmp13 = load float* %tmp12 - %tmp15 = setle float %tmp13, %V - %tmp16 = tail call bool %llvm.isunordered.f32( float %tmp13, float %V ) - %tmp17 = or bool %tmp15, %tmp16 - %retval = select bool %tmp17, float 0.000000e+00, float 1.000000e+00 - ret float %retval - -cond_false23: - %tmp28 = tail call float %foo( float* %V, int %I, int %D, float %V ) - ret float %tmp28 - -UnifiedReturnBlock: ; preds = %cond_true - ret float 0.000000e+00 +//===---------------------------------------------------------------------===// + +For this: + +int test(int a) +{ + return a * 3; } -declare bool %llvm.isunordered.f32(float, float) +We currently emits + imull $3, 4(%esp), %eax + +Perhaps this is what we really should generate is? Is imull three or four +cycles? Note: ICC generates this: + movl 4(%esp), %eax + leal (%eax,%eax,2), %eax + +The current instruction priority is based on pattern complexity. The former is +more "complex" because it folds a load so the latter will not be emitted. + +Perhaps we should use AddedComplexity to give LEA32r a higher priority? We +should always try to match LEA first since the LEA matching code does some +estimate to determine whether the match is profitable. -declare float %foo(float*, int, int, float) +However, if we care more about code size, then imull is better. It's two bytes +shorter than movl + leal. +//===---------------------------------------------------------------------===// -It exposes a known load folding problem: +Implement CTTZ, CTLZ with bsf and bsr. - movss (%edx,%ecx,4), %xmm1 - ucomiss %xmm1, %xmm0 +//===---------------------------------------------------------------------===// -As well as this: +It appears gcc place string data with linkonce linkage in +.section __TEXT,__const_coal,coalesced instead of +.section __DATA,__const_coal,coalesced. +Take a look at darwin.h, there are other Darwin assembler directives that we +do not make use of. -LBB_test_2: # cond_next - movss LCPI1_0, %xmm2 - pxor %xmm3, %xmm3 - ucomiss %xmm0, %xmm1 - jbe LBB_test_6 # cond_next -LBB_test_5: # cond_next - movaps %xmm2, %xmm3 -LBB_test_6: # cond_next - movss %xmm3, 40(%esp) - flds 40(%esp) - addl $44, %esp - ret +//===---------------------------------------------------------------------===// + +int %foo(int* %a, int %t) { +entry: + br label %cond_true + +cond_true: ; preds = %cond_true, %entry + %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ] + %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ] + %tmp2 = getelementptr int* %a, int %x.0.0 + %tmp3 = load int* %tmp2 ; [#uses=1] + %tmp5 = add int %t_addr.0.0, %x.0.0 ; [#uses=1] + %tmp7 = add int %tmp5, %tmp3 ; [#uses=2] + %tmp9 = add int %x.0.0, 1 ; [#uses=2] + %tmp = setgt int %tmp9, 39 ; [#uses=1] + br bool %tmp, label %bb12, label %cond_true + +bb12: ; preds = %cond_true + ret int %tmp7 +} + +is pessimized by -loop-reduce and -indvars + +//===---------------------------------------------------------------------===// -Clearly it's unnecessary to clear %xmm3. 
It's also not clear why we are emitting -three moves (movss, movaps, movss). +u32 to float conversion improvement: + +float uint32_2_float( unsigned u ) { + float fl = (int) (u & 0xffff); + float fh = (int) (u >> 16); + fh *= 0x1.0p16f; + return fh + fl; +} + +00000000 subl $0x04,%esp +00000003 movl 0x08(%esp,1),%eax +00000007 movl %eax,%ecx +00000009 shrl $0x10,%ecx +0000000c cvtsi2ss %ecx,%xmm0 +00000010 andl $0x0000ffff,%eax +00000015 cvtsi2ss %eax,%xmm1 +00000019 mulss 0x00000078,%xmm0 +00000021 addss %xmm1,%xmm0 +00000025 movss %xmm0,(%esp,1) +0000002a flds (%esp,1) +0000002d addl $0x04,%esp +00000030 ret //===---------------------------------------------------------------------===// -External test Nurbs exposed some problems. Look for -__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc -emits: - - movaps (%edx), %xmm2 #59.21 - movaps (%edx), %xmm5 #60.21 - movaps (%edx), %xmm4 #61.21 - movaps (%edx), %xmm3 #62.21 - movl 40(%ecx), %ebp #69.49 - shufps $0, %xmm2, %xmm5 #60.21 - movl 100(%esp), %ebx #69.20 - movl (%ebx), %edi #69.20 - imull %ebp, %edi #69.49 - addl (%eax), %edi #70.33 - shufps $85, %xmm2, %xmm4 #61.21 - shufps $170, %xmm2, %xmm3 #62.21 - shufps $255, %xmm2, %xmm2 #63.21 - lea (%ebp,%ebp,2), %ebx #69.49 - negl %ebx #69.49 - lea -3(%edi,%ebx), %ebx #70.33 - shll $4, %ebx #68.37 - addl 32(%ecx), %ebx #68.37 - testb $15, %bl #91.13 - jne L_B1.24 # Prob 5% #91.13 - -This is the llvm code after instruction scheduling: - -cond_next140 (0xa910740, LLVM BB @0xa90beb0): - %reg1078 = MOV32ri -3 - %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0 - %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40 - %reg1080 = IMUL32rr %reg1079, %reg1037 - %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0 - %reg1038 = LEA32r %reg1081, 1, %reg1080, -3 - %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32 - %reg1082 = SHL32ri %reg1038, 4 - %reg1039 = ADD32rr %reg1036, %reg1082 - %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0 - %reg1034 = SHUFPSrr %reg1083, %reg1083, 170 - %reg1032 = SHUFPSrr %reg1083, %reg1083, 0 - %reg1035 = SHUFPSrr %reg1083, %reg1083, 255 - %reg1033 = SHUFPSrr %reg1083, %reg1083, 85 - %reg1040 = MOV32rr %reg1039 - %reg1084 = AND32ri8 %reg1039, 15 - CMP32ri8 %reg1084, 0 - JE mbb - -Still ok. After register allocation: - -cond_next140 (0xa910740, LLVM BB @0xa90beb0): - %EAX = MOV32ri -3 - %EDX = MOV32rm , 1, %NOREG, 0 - ADD32rm %EAX, %EDX, 1, %NOREG, 0 - %EDX = MOV32rm , 1, %NOREG, 0 - %EDX = MOV32rm %EDX, 1, %NOREG, 40 - IMUL32rr %EAX, %EDX - %ESI = MOV32rm , 1, %NOREG, 0 - %ESI = MOV32rm %ESI, 1, %NOREG, 0 - MOV32mr , 1, %NOREG, 0, %ESI - %EAX = LEA32r %ESI, 1, %EAX, -3 - %ESI = MOV32rm , 1, %NOREG, 0 - %ESI = MOV32rm %ESI, 1, %NOREG, 32 - %EDI = MOV32rr %EAX - SHL32ri %EDI, 4 - ADD32rr %EDI, %ESI - %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0 - %XMM1 = MOVAPSrr %XMM0 - SHUFPSrr %XMM1, %XMM1, 170 - %XMM2 = MOVAPSrr %XMM0 - SHUFPSrr %XMM2, %XMM2, 0 - %XMM3 = MOVAPSrr %XMM0 - SHUFPSrr %XMM3, %XMM3, 255 - SHUFPSrr %XMM0, %XMM0, 85 - %EBX = MOV32rr %EDI - AND32ri8 %EBX, 15 - CMP32ri8 %EBX, 0 - JE mbb - -This looks really bad. The problem is shufps is a destructive opcode. Since it -appears as operand two in more than one shufps ops. It resulted in a number of -copies. Note icc also suffers from the same problem. Either the instruction -selector should select pshufd or The register allocator can made the two-address -to three-address transformation. - -It also exposes some other problems. See MOV32ri -3 and the spills. 
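
A minimal intrinsics reduction of the shufps problem above (splat4 is a
reduced sketch, not the Nurbs source, and assumes SSE2 for pshufd): every
splat reads the same source register, so a destructive shufps needs a movaps
copy per splat to keep the source live, while pshufd writes a separate
destination register and needs none.

#include <emmintrin.h>

void splat4(__m128i v, __m128i *out) {
  /* Same shuffle immediates as in the listing above: 0, 85, 170, 255. */
  out[0] = _mm_shuffle_epi32(v, 0);
  out[1] = _mm_shuffle_epi32(v, 85);
  out[2] = _mm_shuffle_epi32(v, 170);
  out[3] = _mm_shuffle_epi32(v, 255);
}
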
+When using fastcc abi, align stack slot of argument of type double on 8 byte +boundary to improve performance. //===---------------------------------------------------------------------===// -http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500 - -LLVM is producing bad code. - -LBB_main_4: # cond_true44 - addps %xmm1, %xmm2 - subps %xmm3, %xmm2 - movaps (%ecx), %xmm4 - movaps %xmm2, %xmm1 - addps %xmm4, %xmm1 - addl $16, %ecx - incl %edx - cmpl $262144, %edx - movaps %xmm3, %xmm2 - movaps %xmm4, %xmm3 - jne LBB_main_4 # cond_true44 - -There are two problems. 1) No need to two loop induction variables. We can -compare against 262144 * 16. 2) Known register coalescer issue. We should -be able eliminate one of the movaps: - - addps %xmm2, %xmm1 <=== Commute! - subps %xmm3, %xmm1 - movaps (%ecx), %xmm4 - movaps %xmm1, %xmm1 <=== Eliminate! - addps %xmm4, %xmm1 - addl $16, %ecx - incl %edx - cmpl $262144, %edx - movaps %xmm3, %xmm2 - movaps %xmm4, %xmm3 - jne LBB_main_4 # cond_true44 +Codegen: + +int f(int a, int b) { + if (a == 4 || a == 6) + b++; + return b; +} + + +as: + +or eax, 2 +cmp eax, 6 +jz label //===---------------------------------------------------------------------===// -Consider: +GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting +simplifications for integer "x cmp y ? a : b". For example, instead of: -__m128 test(float a) { - return _mm_set_ps(0.0, 0.0, 0.0, a*a); +int G; +void f(int X, int Y) { + G = X < 0 ? 14 : 13; } -This compiles into: +compiling to: -movss 4(%esp), %xmm1 -mulss %xmm1, %xmm1 -xorps %xmm0, %xmm0 -movss %xmm1, %xmm0 -ret +_f: + movl $14, %eax + movl $13, %ecx + movl 4(%esp), %edx + testl %edx, %edx + cmovl %eax, %ecx + movl %ecx, _G + ret -Because mulss doesn't modify the top 3 elements, the top elements of -xmm1 are already zero'd. We could compile this to: +it could be: +_f: + movl 4(%esp), %eax + sarl $31, %eax + notl %eax + addl $14, %eax + movl %eax, _G + ret -movss 4(%esp), %xmm0 -mulss %xmm0, %xmm0 -ret +etc. //===---------------------------------------------------------------------===// -Here's a sick and twisted idea. Consider code like this: +Currently we don't have elimination of redundant stack manipulations. Consider +the code: -__m128 test(__m128 a) { - float b = *(float*)&A; - ... - return _mm_set_ps(0.0, 0.0, 0.0, b); +int %main() { +entry: + call fastcc void %test1( ) + call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) ) + ret int 0 } -This might compile to this code: +declare fastcc void %test1() -movaps c(%esp), %xmm1 -xorps %xmm0, %xmm0 -movss %xmm1, %xmm0 -ret +declare fastcc void %test2(sbyte*) -Now consider if the ... code caused xmm1 to get spilled. This might produce -this code: -movaps c(%esp), %xmm1 -movaps %xmm1, c2(%esp) -... +This currently compiles to: + + subl $16, %esp + call _test5 + addl $12, %esp + subl $16, %esp + movl $_test5, (%esp) + call _test6 + addl $12, %esp + +The add\sub pair is really unneeded here. 
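
The same test case in C for reference (a sketch that ignores the fastcc
convention; test1/test2 are the externals from the IR above). The
addl $12, %esp / subl $16, %esp pair between the two calls could be folded
into a single subl $4, %esp, or the argument area could simply be kept at its
maximum size across both calls.

void test1(void);
void test2(void (*f)(void));

int main(void) {
  test1();
  test2(test1);   /* passes test1's address, as in the IR above */
  return 0;
}
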
+ +//===---------------------------------------------------------------------===// + +We currently compile sign_extend_inreg into two shifts: -xorps %xmm0, %xmm0 -movaps c2(%esp), %xmm1 -movss %xmm1, %xmm0 -ret +long foo(long X) { + return (long)(signed char)X; +} + +becomes: + +_foo: + movl 4(%esp), %eax + shll $24, %eax + sarl $24, %eax + ret + +This could be: + +_foo: + movsbl 4(%esp),%eax + ret + +//===---------------------------------------------------------------------===// -However, since the reload is only used by these instructions, we could -"fold" it into the uses, producing something like this: +Consider the expansion of: + +uint %test3(uint %X) { + %tmp1 = rem uint %X, 255 + ret uint %tmp1 +} + +Currently it compiles to: -movaps c(%esp), %xmm1 -movaps %xmm1, c2(%esp) +... + movl $2155905153, %ecx + movl 8(%esp), %esi + movl %esi, %eax + mull %ecx ... -movss c2(%esp), %xmm0 -ret +This could be "reassociated" into: -... saving two instructions. + movl $2155905153, %eax + movl 8(%esp), %ecx + mull %ecx -The basic idea is that a reload from a spill slot, can, if only one 4-byte -chunk is used, bring in 3 zeros the the one element instead of 4 elements. -This can be used to simplify a variety of shuffle operations, where the -elements are fixed zeros. +to avoid the copy. In fact, the existing two-address stuff would do this +except that mul isn't a commutative 2-addr instruction. I guess this has +to be done at isel time based on the #uses to mul? //===---------------------------------------------------------------------===// -We generate significantly worse code for this than GCC: -http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150 -http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701 +Make sure the instruction which starts a loop does not cross a cacheline +boundary. This requires knowning the exact length of each machine instruction. +That is somewhat complicated, but doable. Example 256.bzip2: -There is also one case we do worse on PPC. +In the new trace, the hot loop has an instruction which crosses a cacheline +boundary. In addition to potential cache misses, this can't help decoding as I +imagine there has to be some kind of complicated decoder reset and realignment +to grab the bytes from the next cacheline. + +532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines +942 942 0x3d03 movl %dh, (1809(%esp, %esi) +937 937 0x3d0a incl %esi +3 3 0x3d0b cmpb %bl, %dl +27 27 0x3d0d jnz 0x000062db //===---------------------------------------------------------------------===// -For this: +In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE. + +//===---------------------------------------------------------------------===// + +This could be a single 16-bit load. -#include -void test(__m128d *r, __m128d *A, double B) { - *r = _mm_loadl_pd(*A, &B); +int f(char *p) { + if ((p[0] == 1) & (p[1] == 2)) return 1; + return 0; } -We generates: +//===---------------------------------------------------------------------===// - subl $12, %esp - movsd 24(%esp), %xmm0 - movsd %xmm0, (%esp) - movl 20(%esp), %eax - movapd (%eax), %xmm0 - movlpd (%esp), %xmm0 - movl 16(%esp), %eax - movapd %xmm0, (%eax) - addl $12, %esp - ret +We should inline lrintf and probably other libc functions. -icc generates: +//===---------------------------------------------------------------------===// - movl 4(%esp), %edx #3.6 - movl 8(%esp), %eax #3.6 - movapd (%eax), %xmm0 #4.22 - movlpd 12(%esp), %xmm0 #4.8 - movapd %xmm0, (%edx) #4.3 - ret #5.1 +Start using the flags more. 
For example, compile: -So icc is smart enough to know that B is in memory so it doesn't load it and -store it back to stack. +int add_zf(int *x, int y, int a, int b) { + if ((*x += y) == 0) + return a; + else + return b; +} -//===---------------------------------------------------------------------===// +to: + addl %esi, (%rdi) + movl %edx, %eax + cmovne %ecx, %eax + ret +instead of: + +_add_zf: + addl (%rdi), %esi + movl %esi, (%rdi) + testl %esi, %esi + cmove %edx, %ecx + movl %ecx, %eax + ret -__m128d test1( __m128d A, __m128d B) { - return _mm_shuffle_pd(A, B, 0x3); +and: + +int add_zf(int *x, int y, int a, int b) { + if ((*x + y) < 0) + return a; + else + return b; } -compiles to +to: + +add_zf: + addl (%rdi), %esi + movl %edx, %eax + cmovns %ecx, %eax + ret + +instead of: -shufpd $3, %xmm1, %xmm0 +_add_zf: + addl (%rdi), %esi + testl %esi, %esi + cmovs %edx, %ecx + movl %ecx, %eax + ret + +//===---------------------------------------------------------------------===// -Perhaps it's better to use unpckhpd instead? +This: +#include +int foo(double X) { return isnan(X); } -unpckhpd %xmm1, %xmm0 +compiles to (-m64): -Don't know if unpckhpd is faster. But it is shorter. +_foo: + pxor %xmm1, %xmm1 + ucomisd %xmm1, %xmm0 + setp %al + movzbl %al, %eax + ret + +the pxor is not needed, we could compare the value against itself. //===---------------------------------------------------------------------===// -If shorter, we should use things like: -movzwl %ax, %eax -instead of: -andl $65535, %EAX +These two functions have identical effects: -The former can also be used when the two-addressy nature of the 'and' would -require a copy to be inserted (in X86InstrInfo::convertToThreeAddress). +unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;} +unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;} + +We currently compile them to: + +_f: + movl 4(%esp), %eax + movl %eax, %ecx + incl %ecx + movl 8(%esp), %edx + cmpl %edx, %ecx + jne LBB1_2 #UnifiedReturnBlock +LBB1_1: #cond_true + addl $2, %eax + ret +LBB1_2: #UnifiedReturnBlock + movl %ecx, %eax + ret +_f2: + movl 4(%esp), %eax + movl %eax, %ecx + incl %ecx + cmpl 8(%esp), %ecx + sete %cl + movzbl %cl, %ecx + leal 1(%ecx,%eax), %eax + ret + +both of which are inferior to GCC's: + +_f: + movl 4(%esp), %edx + leal 1(%edx), %eax + addl $2, %edx + cmpl 8(%esp), %eax + cmove %edx, %eax + ret +_f2: + movl 4(%esp), %eax + addl $1, %eax + xorl %edx, %edx + cmpl 8(%esp), %eax + sete %dl + addl %edx, %eax + ret //===---------------------------------------------------------------------===// -This code generates ugly code, probably due to costs being off or something: +This code: -void %test(float* %P, <4 x float>* %P2 ) { - %xFloat0.688 = load float* %P - %loadVector37.712 = load <4 x float>* %P2 - %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3 - store <4 x float> %inFloat3.713, <4 x float>* %P2 - ret void +void test(int X) { + if (X) abort(); } -Generates: +is currently compiled to: _test: - pxor %xmm0, %xmm0 - movd %xmm0, %eax ;; EAX = 0! - movl 8(%esp), %ecx - movaps (%ecx), %xmm0 - pinsrw $6, %eax, %xmm0 - shrl $16, %eax ;; EAX = 0 again! 
- pinsrw $7, %eax, %xmm0 - movaps %xmm0, (%ecx) + subl $12, %esp + cmpl $0, 16(%esp) + jne LBB1_1 + addl $12, %esp ret +LBB1_1: + call L_abort$stub -It would be better to generate: +It would be better to produce: _test: - movl 8(%esp), %ecx - movaps (%ecx), %xmm0 - xor %eax, %eax - pinsrw $6, %eax, %xmm0 - pinsrw $7, %eax, %xmm0 - movaps %xmm0, (%ecx) + subl $12, %esp + cmpl $0, 16(%esp) + jne L_abort$stub + addl $12, %esp + ret + +This can be applied to any no-return function call that takes no arguments etc. +Alternatively, the stack save/restore logic could be shrink-wrapped, producing +something like this: + +_test: + cmpl $0, 4(%esp) + jne LBB1_1 + ret +LBB1_1: + subl $12, %esp + call L_abort$stub + +Both are useful in different situations. Finally, it could be shrink-wrapped +and tail called, like this: + +_test: + cmpl $0, 4(%esp) + jne LBB1_1 ret +LBB1_1: + pop %eax # realign stack. + call L_abort$stub -or use pxor (to make a zero vector) and shuffle (to insert it). +Though this probably isn't worth it. //===---------------------------------------------------------------------===// -Bad codegen: +We need to teach the codegen to convert two-address INC instructions to LEA +when the flags are dead. For example, on X86-64, compile: + +int foo(int A, int B) { + return A+1; +} -char foo(int x) { return x; } +to: _foo: - movl 4(%esp), %eax - shll $24, %eax - sarl $24, %eax - ret + leal 1(%edi), %eax + ret + +instead of: + +_foo: + incl %edi + movl %edi, %eax + ret + +Another example is: + +;; X's live range extends beyond the shift, so the register allocator +;; cannot coalesce it with Y. Because of this, a copy needs to be +;; emitted before the shift to save the register value before it is +;; clobbered. However, this copy is not needed if the register +;; allocator turns the shift into an LEA. This also occurs for ADD. + +; Check that the shift gets turned into an LEA. +; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \ +; RUN: not grep {mov E.X, E.X} + +%G = external global int + +int %test1(int %X, int %Y) { + %Z = add int %X, %Y + volatile store int %Y, int* %G + volatile store int %Z, int* %G + ret int %X +} + +int %test2(int %X) { + %Z = add int %X, 1 ;; inc + volatile store int %Z, int* %G + ret int %X +} + +//===---------------------------------------------------------------------===// + +This: +#include +unsigned test(float f) { + return _mm_cvtsi128_si32( (__m128i) _mm_set_ss( f )); +} + +Compiles to: +_test: + movss 4(%esp), %xmm0 + movd %xmm0, %eax + ret + +it should compile to a move from the stack slot directly into eax. DAGCombine +has this xform, but it is currently disabled until the alignment fields of +the load/store nodes are trustworthy. //===---------------------------------------------------------------------===// -Some useful information in the Apple Altivec / SSE Migration Guide: +Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with +a neg instead of a sub instruction. Consider: + +int test(char X) { return 7-X; } + +we currently produce: +_test: + movl $7, %eax + movsbl 4(%esp), %ecx + subl %ecx, %eax + ret + +We would use one fewer register if codegen'd as: + + movsbl 4(%esp), %eax + neg %eax + add $7, %eax + ret + +Note that this isn't beneficial if the load can be folded into the sub. 
In +this case, we want a sub: -http://developer.apple.com/documentation/Performance/Conceptual/ -Accelerate_sse_migration/index.html +int test(int X) { return 7-X; } +_test: + movl $7, %eax + subl 4(%esp), %eax + ret + +//===---------------------------------------------------------------------===// + +For code like: +phi (undef, x) + +We get an implicit def on the undef side. If the phi is spilled, we then get: +implicitdef xmm1 +store xmm1 -> stack + +It should be possible to teach the x86 backend to "fold" the store into the +implicitdef, which just deletes the implicit def. + +These instructions should go away: +#IMPLICIT_DEF %xmm1 +movaps %xmm1, 192(%esp) +movaps %xmm1, 224(%esp) +movaps %xmm1, 176(%esp) + +//===---------------------------------------------------------------------===// + +This is a "commutable two-address" register coallescing deficiency: + +define <4 x float> @test1(<4 x float> %V) { +entry: + %tmp8 = shufflevector <4 x float> %V, <4 x float> undef, <4 x i32> < i32 3, i32 2, i32 1, i32 0 > ; <<4 x float>> [#uses=1] + %add = add <4 x float> %tmp8, %V ; <<4 x float>> [#uses=1] + ret <4 x float> %add +} + +this codegens to: + +_test1: + pshufd $27, %xmm0, %xmm1 + addps %xmm0, %xmm1 + movaps %xmm1, %xmm0 + ret + +instead of: + +_test1: + pshufd $27, %xmm0, %xmm1 + addps %xmm1, %xmm0 + ret -e.g. SSE select using and, andnot, or. Various SSE compare translations.
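
The same deficiency written with intrinsics (a reduced sketch of the IR above;
selector 27 is _MM_SHUFFLE(0,1,2,3), i.e. a lane reversal): because addps is
commutable, the coalescer could swap its operands, define the sum directly in
%xmm0, and drop the trailing movaps.

#include <xmmintrin.h>

__m128 test1_c(__m128 V) {
  /* Reverse the four lanes, then add the original value. */
  __m128 rev = _mm_shuffle_ps(V, V, _MM_SHUFFLE(0, 1, 2, 3));
  return _mm_add_ps(rev, V);
}
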