+We compile vector multiply-by-constant into poor code:
+
+define <4 x i32> @f(<4 x i32> %i) nounwind {
+ %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
+ ret <4 x i32> %A
+}
+
+On targets without SSE4.1, this compiles into:
+
+LCPI1_0: ## <4 x i32>
+ .long 10
+ .long 10
+ .long 10
+ .long 10
+ .text
+ .align 4,0x90
+ .globl _f
+_f:
+ pshufd $3, %xmm0, %xmm1
+ movd %xmm1, %eax
+ imull LCPI1_0+12, %eax
+ movd %eax, %xmm1
+ pshufd $1, %xmm0, %xmm2
+ movd %xmm2, %eax
+ imull LCPI1_0+4, %eax
+ movd %eax, %xmm2
+ punpckldq %xmm1, %xmm2
+ movd %xmm0, %eax
+ imull LCPI1_0, %eax
+ movd %eax, %xmm1
+ movhlps %xmm0, %xmm0
+ movd %xmm0, %eax
+ imull LCPI1_0+8, %eax
+ movd %eax, %xmm0
+ punpckldq %xmm0, %xmm1
+ movaps %xmm1, %xmm0
+ punpckldq %xmm2, %xmm0
+ ret
+
+It would be better to synthesize integer vector multiplication by constants
+using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
+simple cases such as multiplication by powers of two would be better as
+vector shifts than as multiplications.
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+__m128i
+foo2 (char x)
+{
+ return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
+}
+
+into:
+ movl $1, %eax
+ xorps %xmm0, %xmm0
+ pinsrw $2, %eax, %xmm0
+ movzbl 4(%esp), %eax
+ pinsrw $3, %eax, %xmm0
+ movl $256, %eax
+ pinsrw $7, %eax, %xmm0
+ ret
+
+
+gcc-4.2:
+ subl $12, %esp
+ movzbl 16(%esp), %eax
+ movdqa LC0, %xmm0
+ pinsrw $3, %eax, %xmm0
+ addl $12, %esp
+ ret
+ .const
+ .align 4
+LC0:
+ .word 0
+ .word 0
+ .word 1
+ .word 0
+ .word 0
+ .word 0
+ .word 0
+ .word 256
+
+With SSE4, it should be
+ movdqa .LC0(%rip), %xmm0
+ pinsrb $6, %edi, %xmm0
+
+//===---------------------------------------------------------------------===//
+
+We should transform a shuffle of two vectors of constants into a single vector
+of constants. Also, insertelement of a constant into a vector of constants
+should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.
+
+We compiled it to something horrible:
+
+ .align 4
+LCPI1_1: ## float
+ .long 1065353216 ## float 1
+ .const
+
+ .align 4
+LCPI1_0: ## <4 x float>
+ .space 4
+ .long 1065353216 ## float 1
+ .space 4
+ .long 1065353216 ## float 1
+ .text
+ .align 4,0x90
+ .globl _t
+_t:
+ xorps %xmm0, %xmm0
+ movhps LCPI1_0, %xmm0
+ movss LCPI1_1, %xmm1
+ movaps %xmm0, %xmm2
+ shufps $2, %xmm1, %xmm2
+ shufps $132, %xmm2, %xmm0
+ movaps %xmm0, 0
+
+//===---------------------------------------------------------------------===//
+rdar://5907648
+
+This function:
+
+float foo(unsigned char x) {
+ return x;
+}
+
+compiles to (x86-32):
+
+define float @foo(i8 zeroext %x) nounwind {
+ %tmp12 = uitofp i8 %x to float ; <float> [#uses=1]
+ ret float %tmp12
+}
+
+compiles to:
+
+_foo:
+ subl $4, %esp
+ movzbl 8(%esp), %eax
+ cvtsi2ss %eax, %xmm0
+ movss %xmm0, (%esp)
+ flds (%esp)
+ addl $4, %esp
+ ret
+
+We should be able to use:
+ cvtsi2ss 8($esp), %xmm0
+since we know the stack slot is already zext'd.
+
+//===---------------------------------------------------------------------===//
+
+Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
+when code size is critical. movlps is slower than movsd on core2 but it's one
+byte shorter.
+
+//===---------------------------------------------------------------------===//
+
+We should use a dynamic programming based approach to tell when using FPStack
+operations is cheaper than SSE. SciMark montecarlo contains code like this
+for example:
+
+double MonteCarlo_num_flops(int Num_samples) {
+ return ((double) Num_samples)* 4.0;
+}
+
+In fpstack mode, this compiles into:
+
+LCPI1_0:
+ .long 1082130432 ## float 4.000000e+00
+_MonteCarlo_num_flops:
+ subl $4, %esp
+ movl 8(%esp), %eax
+ movl %eax, (%esp)
+ fildl (%esp)
+ fmuls LCPI1_0
+ addl $4, %esp
+ ret
+
+in SSE mode, it compiles into significantly slower code:
+
+_MonteCarlo_num_flops:
+ subl $12, %esp
+ cvtsi2sd 16(%esp), %xmm0
+ mulsd LCPI1_0, %xmm0
+ movsd %xmm0, (%esp)
+ fldl (%esp)
+ addl $12, %esp
+ ret
+
+There are also other cases in scimark where using fpstack is better, it is
+cheaper to do fld1 than load from a constant pool for example, so
+"load, add 1.0, store" is better done in the fp stack, etc.
+
+//===---------------------------------------------------------------------===//
+
+The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
+"cmpsd". For example, this code:
+
+double d1(double x) { return x == x ? x : x + x; }
+
+Compiles into:
+
+_d1:
+ ucomisd %xmm0, %xmm0
+ jnp LBB1_2
+ addsd %xmm0, %xmm0
+ ret
+LBB1_2:
+ ret
+
+Also, the 'ret's should be shared. This is PR6032.
+
+//===---------------------------------------------------------------------===//
+
+These should compile into the same code (PR6214): Perhaps instcombine should
+canonicalize the former into the later?
+
+define float @foo(float %x) nounwind {
+ %t = bitcast float %x to i32
+ %s = and i32 %t, 2147483647
+ %d = bitcast i32 %s to float
+ ret float %d
+}
+
+declare float @fabsf(float %n)
+define float @bar(float %x) nounwind {
+ %d = call float @fabsf(float %x)
+ ret float %d
+}
+
+//===---------------------------------------------------------------------===//
+
+This IR (from PR6194):
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+%0 = type { double, double }
+%struct.float3 = type { float, float, float }
+
+define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {