X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FREADME-SSE.txt;h=496b704ee85fb16878eb8585a77121b78497a674;hb=b8f0d89d0584e37e205c04ed5753f57a23365403;hp=7269fa2964583579690203add2888b45ad55967f;hpb=d4083d01d275de9d30b0e23a03d8bd1984c8ddab;p=oota-llvm.git diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index 7269fa29645..496b704ee85 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -2,8 +2,65 @@ // Random ideas for the X86 backend: SSE-specific stuff. //===---------------------------------------------------------------------===// -- Consider eliminating the unaligned SSE load intrinsics, replacing them with - unaligned LLVM load instructions. +//===---------------------------------------------------------------------===// + +SSE Variable shift can be custom lowered to something like this, which uses a +small table + unaligned load + shuffle instead of going through memory. + +__m128i_shift_right: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + +... +__m128i shift_right(__m128i value, unsigned long offset) { + return _mm_shuffle_epi8(value, + _mm_loadu_si128((__m128 *) (___m128i_shift_right + offset))); +} + +//===---------------------------------------------------------------------===// + +SSE has instructions for doing operations on complex numbers, we should pattern +match them. 
For example, this should turn into a horizontal add: + +typedef float __attribute__((vector_size(16))) v4f32; +float f32(v4f32 A) { + return A[0]+A[1]+A[2]+A[3]; +} + +Instead we get this: + +_f32: ## @f32 + pshufd $1, %xmm0, %xmm1 ## xmm1 = xmm0[1,0,0,0] + addss %xmm0, %xmm1 + pshufd $3, %xmm0, %xmm2 ## xmm2 = xmm0[3,0,0,0] + movhlps %xmm0, %xmm0 ## xmm0 = xmm0[1,1] + movaps %xmm0, %xmm3 + addss %xmm1, %xmm3 + movdqa %xmm2, %xmm0 + addss %xmm3, %xmm0 + ret + +Also, there are cases where some simple local SLP would improve codegen a bit. +compiling this: + +_Complex float f32(_Complex float A, _Complex float B) { + return A+B; +} + +into: + +_f32: ## @f32 + movdqa %xmm0, %xmm2 + addss %xmm1, %xmm2 + pshufd $1, %xmm1, %xmm1 ## xmm1 = xmm1[1,0,0,0] + pshufd $1, %xmm0, %xmm3 ## xmm3 = xmm0[1,0,0,0] + addss %xmm1, %xmm3 + movaps %xmm2, %xmm0 + unpcklps %xmm3, %xmm0 ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] + ret + +seems silly when it could just be one addps. + //===---------------------------------------------------------------------===// @@ -17,7 +74,7 @@ other fast SSE modes. //===---------------------------------------------------------------------===// -Think about doing i64 math in SSE regs. +Think about doing i64 math in SSE regs on x86-32. //===---------------------------------------------------------------------===// @@ -36,68 +93,12 @@ The pattern isel got this one right. //===---------------------------------------------------------------------===// -SSE doesn't have [mem] op= reg instructions. If we have an SSE instruction -like this: - - X += y - -and the register allocator decides to spill X, it is cheaper to emit this as: - -Y += [xslot] -store Y -> [xslot] - -than as: - -tmp = [xslot] -tmp += y -store tmp -> [xslot] - -..and this uses one fewer register (so this should be done at load folding -time, not at spiller time). *Note* however that this can only be done -if Y is dead. 
Here's a testcase: - -@.str_3 = external global [15 x i8] -declare void @printf(i32, ...) -define void @main() { -build_tree.exit: - br label %no_exit.i7 - -no_exit.i7: ; preds = %no_exit.i7, %build_tree.exit - %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], - [ %tmp.34.i18, %no_exit.i7 ] - %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], - [ %tmp.28.i16, %no_exit.i7 ] - %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00 - %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00 - br i1 false, label %Compute_Tree.exit23, label %no_exit.i7 - -Compute_Tree.exit23: ; preds = %no_exit.i7 - tail call void (i32, ...)* @printf( i32 0 ) - store double %tmp.34.i18, double* null - ret void -} - -We currently emit: - -.BBmain_1: - xorpd %XMM1, %XMM1 - addsd %XMM0, %XMM1 -*** movsd %XMM2, QWORD PTR [%ESP + 8] -*** addsd %XMM2, %XMM1 -*** movsd QWORD PTR [%ESP + 8], %XMM2 - jmp .BBmain_1 # no_exit.i7 - -This is a bugpoint reduced testcase, which is why the testcase doesn't make -much sense (e.g. its an infinite loop). :) - -//===---------------------------------------------------------------------===// - SSE should implement 'select_cc' using 'emulated conditional moves' that use pcmp/pand/pandn/por to do a selection instead of a conditional branch: double %X(double %Y, double %Z, double %A, double %B) { %C = setlt double %A, %B - %z = add double %Z, 0.0 ;; select operand is not a load + %z = fadd double %Z, 0.0 ;; select operand is not a load %D = select bool %C, double %Y, double %z ret double %D } @@ -122,12 +123,6 @@ LBB_X_2: //===---------------------------------------------------------------------===// -It's not clear whether we should use pxor or xorps / xorpd to clear XMM -registers. The choice may depend on subtarget information. We should do some -more experiments on different x86 machines. 
- -//===---------------------------------------------------------------------===// - Lower memcpy / memset to a series of SSE 128 bit move instructions when it's feasible. @@ -151,45 +146,6 @@ Perhaps use pxor / xorp* to clear a XMM register first? //===---------------------------------------------------------------------===// -How to decide when to use the "floating point version" of logical ops? Here are -some code fragments: - - movaps LCPI5_5, %xmm2 - divps %xmm1, %xmm2 - mulps %xmm2, %xmm3 - mulps 8656(%ecx), %xmm3 - addps 8672(%ecx), %xmm3 - andps LCPI5_6, %xmm2 - andps LCPI5_1, %xmm3 - por %xmm2, %xmm3 - movdqa %xmm3, (%edi) - - movaps LCPI5_5, %xmm1 - divps %xmm0, %xmm1 - mulps %xmm1, %xmm3 - mulps 8656(%ecx), %xmm3 - addps 8672(%ecx), %xmm3 - andps LCPI5_6, %xmm1 - andps LCPI5_1, %xmm3 - orps %xmm1, %xmm3 - movaps %xmm3, 112(%esp) - movaps %xmm3, (%ebx) - -Due to some minor source change, the later case ended up using orps and movaps -instead of por and movdqa. Does it matter? - -//===---------------------------------------------------------------------===// - -X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible -to choose between movaps, movapd, and movdqa based on types of source and -destination? - -How about andps, andpd, and pand? Do we really care about the type of the packed -elements? If not, why not always use the "ps" variants which are likely to be -shorter. - -//===---------------------------------------------------------------------===// - External test Nurbs exposed some problems. Look for __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc emits: @@ -278,41 +234,6 @@ It also exposes some other problems. See MOV32ri -3 and the spills. //===---------------------------------------------------------------------===// -http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500 - -LLVM is producing bad code. 
- -LBB_main_4: # cond_true44 - addps %xmm1, %xmm2 - subps %xmm3, %xmm2 - movaps (%ecx), %xmm4 - movaps %xmm2, %xmm1 - addps %xmm4, %xmm1 - addl $16, %ecx - incl %edx - cmpl $262144, %edx - movaps %xmm3, %xmm2 - movaps %xmm4, %xmm3 - jne LBB_main_4 # cond_true44 - -There are two problems. 1) No need to two loop induction variables. We can -compare against 262144 * 16. 2) Known register coalescer issue. We should -be able eliminate one of the movaps: - - addps %xmm2, %xmm1 <=== Commute! - subps %xmm3, %xmm1 - movaps (%ecx), %xmm4 - movaps %xmm1, %xmm1 <=== Eliminate! - addps %xmm4, %xmm1 - addl $16, %ecx - incl %edx - cmpl $262144, %edx - movaps %xmm3, %xmm2 - movaps %xmm4, %xmm3 - jne LBB_main_4 # cond_true44 - -//===---------------------------------------------------------------------===// - Consider: __m128 test(float a) { @@ -376,74 +297,12 @@ ret ... saving two instructions. The basic idea is that a reload from a spill slot, can, if only one 4-byte -chunk is used, bring in 3 zeros the the one element instead of 4 elements. +chunk is used, bring in 3 zeros and the one element instead of 4 elements. This can be used to simplify a variety of shuffle operations, where the elements are fixed zeros. //===---------------------------------------------------------------------===// -For this: - -#include -void test(__m128d *r, __m128d *A, double B) { - *r = _mm_loadl_pd(*A, &B); -} - -We generates: - - subl $12, %esp - movsd 24(%esp), %xmm0 - movsd %xmm0, (%esp) - movl 20(%esp), %eax - movapd (%eax), %xmm0 - movlpd (%esp), %xmm0 - movl 16(%esp), %eax - movapd %xmm0, (%eax) - addl $12, %esp - ret - -icc generates: - - movl 4(%esp), %edx #3.6 - movl 8(%esp), %eax #3.6 - movapd (%eax), %xmm0 #4.22 - movlpd 12(%esp), %xmm0 #4.8 - movapd %xmm0, (%edx) #4.3 - ret #5.1 - -So icc is smart enough to know that B is in memory so it doesn't load it and -store it back to stack.
- -This should be fixed by eliminating the llvm.x86.sse2.loadl.pd intrinsic, -lowering it to a load+insertelement instead. Already match the load+shuffle -as movlpd, so this should be easy. We already get optimal code for: - -define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) { -entry: - %tmp2 = load <2 x double>* %A, align 16 - %tmp8 = insertelement <2 x double> %tmp2, double %B, i32 0 - store <2 x double> %tmp8, <2 x double>* %r, align 16 - ret void -} - -//===---------------------------------------------------------------------===// - -__m128d test1( __m128d A, __m128d B) { - return _mm_shuffle_pd(A, B, 0x3); -} - -compiles to - -shufpd $3, %xmm1, %xmm0 - -Perhaps it's better to use unpckhpd instead? - -unpckhpd %xmm1, %xmm0 - -Don't know if unpckhpd is faster. But it is shorter. - -//===---------------------------------------------------------------------===// - This code generates ugly code, probably due to costs being off or something: define void @test(float* %P, <4 x float>* %P2 ) { @@ -505,46 +364,6 @@ nodes which are selected to max / min instructions that are marked commutable. 
//===---------------------------------------------------------------------===// -We should compile this: -#include -typedef union { - int i[4]; - float f[4]; - __m128 v; -} vector4_t; -void swizzle (const void *a, vector4_t * b, vector4_t * c) { - b->v = _mm_loadl_pi (b->v, (__m64 *) a); - c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1); -} - -to: - -_swizzle: - movl 4(%esp), %eax - movl 8(%esp), %edx - movl 12(%esp), %ecx - movlps (%eax), %xmm0 - movlps %xmm0, (%edx) - movlps 8(%eax), %xmm0 - movlps %xmm0, (%ecx) - ret - -not: - -swizzle: - movl 8(%esp), %eax - movaps (%eax), %xmm0 - movl 4(%esp), %ecx - movlps (%ecx), %xmm0 - movaps %xmm0, (%eax) - movl 12(%esp), %eax - movaps (%eax), %xmm0 - movlps 8(%ecx), %xmm0 - movaps %xmm0, (%eax) - ret - -//===---------------------------------------------------------------------===// - We should materialize vector constants like "all ones" and "signbit" with code like: @@ -631,10 +450,11 @@ eliminates a constant pool load. For example, consider: define i64 @ccosf(float %z.0, float %z.1) nounwind readonly { entry: - %tmp6 = sub float -0.000000e+00, %z.1 ; [#uses=1] + %tmp6 = fsub float -0.000000e+00, %z.1 ; [#uses=1] %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly ret i64 %tmp20 } +declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly This currently compiles to: @@ -814,7 +634,7 @@ define <4 x i32> @f(<4 x i32> %i) nounwind { ret <4 x i32> %A } -Compiles into: +On targets without SSE4.1, this compiles into: LCPI1_0: ## <4 x i32> .long 10 @@ -846,6 +666,11 @@ _f: punpckldq %xmm2, %xmm0 ret +It would be better to synthesize integer vector multiplication by constants +using shifts and adds, pslld and paddd here. And even on targets with SSE4.1, +simple cases such as multiplication by powers of two would be better as +vector shifts than as multiplications. 
+ //===---------------------------------------------------------------------===// We compile this: @@ -889,3 +714,251 @@ LC0: With SSE4, it should be movdqa .LC0(%rip), %xmm0 pinsrb $6, %edi, %xmm0 + +//===---------------------------------------------------------------------===// + +We should transform a shuffle of two vectors of constants into a single vector +of constants. Also, insertelement of a constant into a vector of constants +should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll. + +We compiled it to something horrible: + + .align 4 +LCPI1_1: ## float + .long 1065353216 ## float 1 + .const + + .align 4 +LCPI1_0: ## <4 x float> + .space 4 + .long 1065353216 ## float 1 + .space 4 + .long 1065353216 ## float 1 + .text + .align 4,0x90 + .globl _t +_t: + xorps %xmm0, %xmm0 + movhps LCPI1_0, %xmm0 + movss LCPI1_1, %xmm1 + movaps %xmm0, %xmm2 + shufps $2, %xmm1, %xmm2 + shufps $132, %xmm2, %xmm0 + movaps %xmm0, 0 + +//===---------------------------------------------------------------------===// +rdar://5907648 + +This function: + +float foo(unsigned char x) { + return x; +} + +compiles to (x86-32): + +define float @foo(i8 zeroext %x) nounwind { + %tmp12 = uitofp i8 %x to float ; [#uses=1] + ret float %tmp12 +} + +which in turn compiles to: + +_foo: + subl $4, %esp + movzbl 8(%esp), %eax + cvtsi2ss %eax, %xmm0 + movss %xmm0, (%esp) + flds (%esp) + addl $4, %esp + ret + +We should be able to use: + cvtsi2ss 8(%esp), %xmm0 +since we know the stack slot is already zext'd. + +//===---------------------------------------------------------------------===// + +Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64)) +when code size is critical. movlps is slower than movsd on core2 but it's one +byte shorter. + +//===---------------------------------------------------------------------===// + +We should use a dynamic programming based approach to tell when using FPStack +operations is cheaper than SSE.
SciMark montecarlo contains code like this +for example: + +double MonteCarlo_num_flops(int Num_samples) { + return ((double) Num_samples)* 4.0; +} + +In fpstack mode, this compiles into: + +LCPI1_0: + .long 1082130432 ## float 4.000000e+00 +_MonteCarlo_num_flops: + subl $4, %esp + movl 8(%esp), %eax + movl %eax, (%esp) + fildl (%esp) + fmuls LCPI1_0 + addl $4, %esp + ret + +in SSE mode, it compiles into significantly slower code: + +_MonteCarlo_num_flops: + subl $12, %esp + cvtsi2sd 16(%esp), %xmm0 + mulsd LCPI1_0, %xmm0 + movsd %xmm0, (%esp) + fldl (%esp) + addl $12, %esp + ret + +There are also other cases in scimark where using fpstack is better: it is +cheaper to do fld1 than load from a constant pool for example, so +"load, add 1.0, store" is better done in the fp stack, etc. + +//===---------------------------------------------------------------------===// + +The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to +"cmpsd". For example, this code: + +double d1(double x) { return x == x ? x : x + x; } + +Compiles into: + +_d1: + ucomisd %xmm0, %xmm0 + jnp LBB1_2 + addsd %xmm0, %xmm0 + ret +LBB1_2: + ret + +Also, the 'ret's should be shared. This is PR6032. + +//===---------------------------------------------------------------------===// + +These should compile into the same code (PR6214): Perhaps instcombine should +canonicalize the former into the latter?
+ +define float @foo(float %x) nounwind { + %t = bitcast float %x to i32 + %s = and i32 %t, 2147483647 + %d = bitcast i32 %s to float + ret float %d +} + +declare float @fabsf(float %n) +define float @bar(float %x) nounwind { + %d = call float @fabsf(float %x) + ret float %d +} + +//===---------------------------------------------------------------------===// + +This IR (from PR6194): + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-darwin10.0.0" + +%0 = type { double, double } +%struct.float3 = type { float, float, float } + +define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp { +entry: + %tmp18 = extractvalue %0 %0, 0 ; [#uses=1] + %tmp19 = bitcast double %tmp18 to i64 ; [#uses=1] + %tmp20 = zext i64 %tmp19 to i128 ; [#uses=1] + %tmp10 = lshr i128 %tmp20, 32 ; [#uses=1] + %tmp11 = trunc i128 %tmp10 to i32 ; [#uses=1] + %tmp12 = bitcast i32 %tmp11 to float ; [#uses=1] + %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; [#uses=1] + store float %tmp12, float* %tmp5 + ret void +} + +Compiles to: + +_test: ## @test + movd %xmm0, %rax + shrq $32, %rax + movl %eax, 4(%rdi) + ret + +This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and +doing a shuffle from v[1] to v[0] then a float store. 
+ +//===---------------------------------------------------------------------===// + +On SSE4 machines, we compile this code: + +define <2 x float> @test2(<2 x float> %Q, <2 x float> %R, + <2 x float> *%P) nounwind { + %Z = fadd <2 x float> %Q, %R + + store <2 x float> %Z, <2 x float> *%P + ret <2 x float> %Z +} + +into: + +_test2: ## @test2 +## BB#0: + insertps $0, %xmm2, %xmm2 + insertps $16, %xmm3, %xmm2 + insertps $0, %xmm0, %xmm3 + insertps $16, %xmm1, %xmm3 + addps %xmm2, %xmm3 + movq %xmm3, (%rdi) + movaps %xmm3, %xmm0 + pshufd $1, %xmm3, %xmm1 + ## kill: XMM1 XMM1 + ret + +The insertps's of $0 are pointless complex copies. + +//===---------------------------------------------------------------------===// + +[UNSAFE FP] + +void foo(double, double, double); +void norm(double x, double y, double z) { + double scale = __builtin_sqrt(x*x + y*y + z*z); + foo(x/scale, y/scale, z/scale); +} + +We currently generate an sqrtsd and 3 divsd instructions. This is bad: fp div is +slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first +and emit 3 mulsd in place of the divs. This can be done as a target-independent +transform. + +If we're dealing with floats instead of doubles we could even replace the sqrtss +and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the +cost of reduced accuracy. + +//===---------------------------------------------------------------------===// + +This function should be matched to haddpd when the appropriate CPU is enabled: + +#include <emmintrin.h> +double f (__m128d p) { + return p[0] + p[1]; +} + +similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should +turn into hsubpd also. + +//===---------------------------------------------------------------------===// + +define <2 x i32> @foo(<2 x double> %in) { + %x = fptosi <2 x double> %in to <2 x i32> + ret <2 x i32> %x +} + +Should compile into cvttpd2dq instead of being scalarized into 2 cvttsd2si.
+ +//===---------------------------------------------------------------------===//