merge the common darwin settings from the X86/PPC/ARM targets

[oota-llvm.git] / lib / Target / X86 / README-SSE.txt
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt

index ad28248ef735b459f157c187e212e96a72f49976..71ad51c7984eb5ee07cd9563cb261ac35aff302d 100644 (file)
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -17,7 +17,7 @@ other fast SSE modes.
  
  //===---------------------------------------------------------------------===//
  
-Think about doing i64 math in SSE regs.
+Think about doing i64 math in SSE regs on x86-32.
  
  //===---------------------------------------------------------------------===//
  
@@ -808,3 +808,111 @@ LC0:
  With SSE4, it should be
        movdqa  .LC0(%rip), %xmm0
        pinsrb  $6, %edi, %xmm0
+
+//===---------------------------------------------------------------------===//
+
+We should transform a shuffle of two vectors of constants into a single vector
+of constants. Similarly, insertelement of a constant into a vector of constants
+should result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.
+
+We compiled it to something horrible:
+
+       .align  4
+LCPI1_1:                                       ##  float
+       .long   1065353216      ## float 1
+       .const
+
+       .align  4
+LCPI1_0:                                       ##  <4 x float>
+       .space  4
+       .long   1065353216      ## float 1
+       .space  4
+       .long   1065353216      ## float 1
+       .text
+       .align  4,0x90
+       .globl  _t
+_t:
+       xorps   %xmm0, %xmm0
+       movhps  LCPI1_0, %xmm0
+       movss   LCPI1_1, %xmm1
+       movaps  %xmm0, %xmm2
+       shufps  $2, %xmm1, %xmm2
+       shufps  $132, %xmm2, %xmm0
+       movaps  %xmm0, 0
+
+//===---------------------------------------------------------------------===//
+rdar://5907648
+
+This function:
+
+float foo(unsigned char x) {
+  return x;
+}
+
+is lowered to this LLVM IR (x86-32):
+
+define float @foo(i8 zeroext  %x) nounwind  {
+       %tmp12 = uitofp i8 %x to float          ; <float> [#uses=1]
+       ret float %tmp12
+}
+
+which compiles to:
+
+_foo:
+       subl    $4, %esp
+       movzbl  8(%esp), %eax
+       cvtsi2ss        %eax, %xmm0
+       movss   %xmm0, (%esp)
+       flds    (%esp)
+       addl    $4, %esp
+       ret
+
+We should be able to use:
+  cvtsi2ss 8(%esp), %xmm0
+since we know the stack slot is already zext'd.
+
+//===---------------------------------------------------------------------===//
+
+Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
+when code size is critical. movlps is slower than movsd on core2 but it's one
+byte shorter.
+
+//===---------------------------------------------------------------------===//
+
+We should use a dynamic programming based approach to tell when using FPStack
+operations is cheaper than SSE.  SciMark montecarlo contains code like this
+for example:
+
+double MonteCarlo_num_flops(int Num_samples) {
+    return ((double) Num_samples)* 4.0;
+}
+
+In fpstack mode, this compiles into:
+
+LCPI1_0:                                       
+       .long   1082130432      ## float 4.000000e+00
+_MonteCarlo_num_flops:
+       subl    $4, %esp
+       movl    8(%esp), %eax
+       movl    %eax, (%esp)
+       fildl   (%esp)
+       fmuls   LCPI1_0
+       addl    $4, %esp
+       ret
+        
+In SSE mode, it compiles into significantly slower code:
+
+_MonteCarlo_num_flops:
+       subl    $12, %esp
+       cvtsi2sd        16(%esp), %xmm0
+       mulsd   LCPI1_0, %xmm0
+       movsd   %xmm0, (%esp)
+       fldl    (%esp)
+       addl    $12, %esp
+       ret
+
+There are also other cases in scimark where using fpstack is better: it is
+cheaper to do fld1 than to load from a constant pool, for example, so
+"load, add 1.0, store" is better done in the fp stack, etc.
+
+//===---------------------------------------------------------------------===//