Add patterns for the x86 popcnt instruction.

[oota-llvm.git] / lib / Target / X86 / README-SSE.txt
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt

index f96b22f1e204214aa1de06f4aabced0f4a94c078..b2116e03b148ea222532c956e9dd78f49fea3a06 100644 (file)
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -20,7 +20,28 @@ __m128i shift_right(__m128i value, unsigned long offset) {
  //===---------------------------------------------------------------------===//
  
  SSE has instructions for doing operations on complex numbers, we should pattern
  //===---------------------------------------------------------------------===//
  
  SSE has instructions for doing operations on complex numbers, we should pattern
-match them.  Compiling this:
+match them.   For example, this should turn into a horizontal add:
+
+typedef float __attribute__((vector_size(16))) v4f32;
+float f32(v4f32 A) {
+  return A[0]+A[1]+A[2]+A[3];
+}
+
+Instead we get this:
+
+_f32:                                   ## @f32
+       pshufd  $1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
+       addss   %xmm0, %xmm1
+       pshufd  $3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
+       movhlps %xmm0, %xmm0            ## xmm0 = xmm0[1,1]
+       movaps  %xmm0, %xmm3
+       addss   %xmm1, %xmm3
+       movdqa  %xmm2, %xmm0
+       addss   %xmm3, %xmm0
+       ret
+
+Also, there are cases where some simple local SLP would improve codegen a bit.
+compiling this:
  
  _Complex float f32(_Complex float A, _Complex float B) {
    return A+B;
  
  _Complex float f32(_Complex float A, _Complex float B) {
    return A+B;
@@ -28,19 +49,17 @@ _Complex float f32(_Complex float A, _Complex float B) {
  
  into:
  
  
  into:
  
-_f32:
+_f32:                                   ## @f32
         movdqa  %xmm0, %xmm2
         addss   %xmm1, %xmm2
         movdqa  %xmm0, %xmm2
         addss   %xmm1, %xmm2
-       pshufd  $16, %xmm2, %xmm2
-       pshufd  $1, %xmm1, %xmm1
-       pshufd  $1, %xmm0, %xmm0
-       addss   %xmm1, %xmm0
-       pshufd  $16, %xmm0, %xmm1
-       movdqa  %xmm2, %xmm0
-       unpcklps        %xmm1, %xmm0
+       pshufd  $1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
+       pshufd  $1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
+       addss   %xmm1, %xmm3
+       movaps  %xmm2, %xmm0
+       unpcklps        %xmm3, %xmm0    ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
         ret
  
         ret
  
-seems silly. 
+seems silly when it could just be one addps.
  
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//