X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FREADME-SSE.txt;h=40110353fc6237ade72586ab2f1e3e2ed8f507c7;hb=7fd324a31fbfd237f43d38d3a780a19fbf909ba3;hp=b4fc53ad0aac29c3c3fead2ce37f7655b5a8a530;hpb=0b0102b172c38dca3b7384225257aa5e9e1ee777;p=oota-llvm.git diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index b4fc53ad0aa..40110353fc6 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -2,8 +2,65 @@ // Random ideas for the X86 backend: SSE-specific stuff. //===---------------------------------------------------------------------===// -- Consider eliminating the unaligned SSE load intrinsics, replacing them with - unaligned LLVM load instructions. +//===---------------------------------------------------------------------===// + +SSE Variable shift can be custom lowered to something like this, which uses a +small table + unaligned load + shuffle instead of going through memory. + +__m128i_shift_right: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + +... +__m128i shift_right(__m128i value, unsigned long offset) { + return _mm_shuffle_epi8(value, + _mm_loadu_si128((__m128i *) (__m128i_shift_right + offset))); +} + +//===---------------------------------------------------------------------===// + +SSE has instructions for doing operations on complex numbers, we should pattern +match them. 
For example, this should turn into a horizontal add: + +typedef float __attribute__((vector_size(16))) v4f32; +float f32(v4f32 A) { + return A[0]+A[1]+A[2]+A[3]; +} + +Instead we get this: + +_f32: ## @f32 + pshufd $1, %xmm0, %xmm1 ## xmm1 = xmm0[1,0,0,0] + addss %xmm0, %xmm1 + pshufd $3, %xmm0, %xmm2 ## xmm2 = xmm0[3,0,0,0] + movhlps %xmm0, %xmm0 ## xmm0 = xmm0[1,1] + movaps %xmm0, %xmm3 + addss %xmm1, %xmm3 + movdqa %xmm2, %xmm0 + addss %xmm3, %xmm0 + ret + +Also, there are cases where some simple local SLP would improve codegen a bit. +compiling this: + +_Complex float f32(_Complex float A, _Complex float B) { + return A+B; +} + +into: + +_f32: ## @f32 + movdqa %xmm0, %xmm2 + addss %xmm1, %xmm2 + pshufd $1, %xmm1, %xmm1 ## xmm1 = xmm1[1,0,0,0] + pshufd $1, %xmm0, %xmm3 ## xmm3 = xmm0[1,0,0,0] + addss %xmm1, %xmm3 + movaps %xmm2, %xmm0 + unpcklps %xmm3, %xmm0 ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] + ret + +seems silly when it could just be one addps. + //===---------------------------------------------------------------------===// @@ -17,7 +74,7 @@ other fast SSE modes. //===---------------------------------------------------------------------===// -Think about doing i64 math in SSE regs. +Think about doing i64 math in SSE regs on x86-32. //===---------------------------------------------------------------------===// @@ -36,65 +93,12 @@ The pattern isel got this one right. //===---------------------------------------------------------------------===// -SSE doesn't have [mem] op= reg instructions. If we have an SSE instruction -like this: - - X += y - -and the register allocator decides to spill X, it is cheaper to emit this as: - -Y += [xslot] -store Y -> [xslot] - -than as: - -tmp = [xslot] -tmp += y -store tmp -> [xslot] - -..and this uses one fewer register (so this should be done at load folding -time, not at spiller time). *Note* however that this can only be done -if Y is dead. 
Here's a testcase: - -%.str_3 = external global [15 x sbyte] ; <[15 x sbyte]*> [#uses=0] -implementation ; Functions: -declare void %printf(int, ...) -void %main() { -build_tree.exit: - br label %no_exit.i7 -no_exit.i7: ; preds = %no_exit.i7, %build_tree.exit - %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ] ; [#uses=1] - %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ] ; [#uses=1] - %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00 - %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00 - br bool false, label %Compute_Tree.exit23, label %no_exit.i7 -Compute_Tree.exit23: ; preds = %no_exit.i7 - tail call void (int, ...)* %printf( int 0 ) - store double %tmp.34.i18, double* null - ret void -} - -We currently emit: - -.BBmain_1: - xorpd %XMM1, %XMM1 - addsd %XMM0, %XMM1 -*** movsd %XMM2, QWORD PTR [%ESP + 8] -*** addsd %XMM2, %XMM1 -*** movsd QWORD PTR [%ESP + 8], %XMM2 - jmp .BBmain_1 # no_exit.i7 - -This is a bugpoint reduced testcase, which is why the testcase doesn't make -much sense (e.g. its an infinite loop). :) - -//===---------------------------------------------------------------------===// - SSE should implement 'select_cc' using 'emulated conditional moves' that use pcmp/pand/pandn/por to do a selection instead of a conditional branch: double %X(double %Y, double %Z, double %A, double %B) { %C = setlt double %A, %B - %z = add double %Z, 0.0 ;; select operand is not a load + %z = fadd double %Z, 0.0 ;; select operand is not a load %D = select bool %C, double %Y, double %z ret double %D } @@ -119,62 +123,11 @@ LBB_X_2: //===---------------------------------------------------------------------===// -It's not clear whether we should use pxor or xorps / xorpd to clear XMM -registers. The choice may depend on subtarget information. We should do some -more experiments on different x86 machines. 
- -//===---------------------------------------------------------------------===// - -Currently the x86 codegen isn't very good at mixing SSE and FPStack -code: - -unsigned int foo(double x) { return x; } - -foo: - subl $20, %esp - movsd 24(%esp), %xmm0 - movsd %xmm0, 8(%esp) - fldl 8(%esp) - fisttpll (%esp) - movl (%esp), %eax - addl $20, %esp - ret - -This will be solved when we go to a dynamic programming based isel. - -//===---------------------------------------------------------------------===// - Lower memcpy / memset to a series of SSE 128 bit move instructions when it's feasible. //===---------------------------------------------------------------------===// -Teach the coalescer to commute 2-addr instructions, allowing us to eliminate -the reg-reg copy in this example: - -float foo(int *x, float *y, unsigned c) { - float res = 0.0; - unsigned i; - for (i = 0; i < c; i++) { - float xx = (float)x[i]; - xx = xx * y[i]; - xx += res; - res = xx; - } - return res; -} - -LBB_foo_3: # no_exit - cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI] - mulss %XMM0, DWORD PTR [%EAX + 4*%ESI] - addss %XMM0, %XMM1 - inc %ESI - cmp %ESI, %ECX -**** movaps %XMM1, %XMM0 - jb LBB_foo_3 # no_exit - -//===---------------------------------------------------------------------===// - Codegen: if (copysign(1.0, x) == copysign(1.0, y)) into: @@ -193,45 +146,6 @@ Perhaps use pxor / xorp* to clear a XMM register first? //===---------------------------------------------------------------------===// -How to decide when to use the "floating point version" of logical ops? 
Here are -some code fragments: - - movaps LCPI5_5, %xmm2 - divps %xmm1, %xmm2 - mulps %xmm2, %xmm3 - mulps 8656(%ecx), %xmm3 - addps 8672(%ecx), %xmm3 - andps LCPI5_6, %xmm2 - andps LCPI5_1, %xmm3 - por %xmm2, %xmm3 - movdqa %xmm3, (%edi) - - movaps LCPI5_5, %xmm1 - divps %xmm0, %xmm1 - mulps %xmm1, %xmm3 - mulps 8656(%ecx), %xmm3 - addps 8672(%ecx), %xmm3 - andps LCPI5_6, %xmm1 - andps LCPI5_1, %xmm3 - orps %xmm1, %xmm3 - movaps %xmm3, 112(%esp) - movaps %xmm3, (%ebx) - -Due to some minor source change, the later case ended up using orps and movaps -instead of por and movdqa. Does it matter? - -//===---------------------------------------------------------------------===// - -X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible -to choose between movaps, movapd, and movdqa based on types of source and -destination? - -How about andps, andpd, and pand? Do we really care about the type of the packed -elements? If not, why not always use the "ps" variants which are likely to be -shorter. - -//===---------------------------------------------------------------------===// - External test Nurbs exposed some problems. Look for __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc emits: @@ -320,41 +234,6 @@ It also exposes some other problems. See MOV32ri -3 and the spills. //===---------------------------------------------------------------------===// -http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500 - -LLVM is producing bad code. - -LBB_main_4: # cond_true44 - addps %xmm1, %xmm2 - subps %xmm3, %xmm2 - movaps (%ecx), %xmm4 - movaps %xmm2, %xmm1 - addps %xmm4, %xmm1 - addl $16, %ecx - incl %edx - cmpl $262144, %edx - movaps %xmm3, %xmm2 - movaps %xmm4, %xmm3 - jne LBB_main_4 # cond_true44 - -There are two problems. 1) No need to two loop induction variables. We can -compare against 262144 * 16. 2) Known register coalescer issue. We should -be able eliminate one of the movaps: - - addps %xmm2, %xmm1 <=== Commute! 
- subps %xmm3, %xmm1 - movaps (%ecx), %xmm4 - movaps %xmm1, %xmm1 <=== Eliminate! - addps %xmm4, %xmm1 - addl $16, %ecx - incl %edx - cmpl $262144, %edx - movaps %xmm3, %xmm2 - movaps %xmm4, %xmm3 - jne LBB_main_4 # cond_true44 - -//===---------------------------------------------------------------------===// - Consider: __m128 test(float a) { @@ -418,68 +297,18 @@ ret ... saving two instructions. The basic idea is that a reload from a spill slot, can, if only one 4-byte -chunk is used, bring in 3 zeros the the one element instead of 4 elements. +chunk is used, bring in 3 zeros and the one element instead of 4 elements. This can be used to simplify a variety of shuffle operations, where the elements are fixed zeros. //===---------------------------------------------------------------------===// -For this: - -#include -void test(__m128d *r, __m128d *A, double B) { - *r = _mm_loadl_pd(*A, &B); -} - -We generates: - - subl $12, %esp - movsd 24(%esp), %xmm0 - movsd %xmm0, (%esp) - movl 20(%esp), %eax - movapd (%eax), %xmm0 - movlpd (%esp), %xmm0 - movl 16(%esp), %eax - movapd %xmm0, (%eax) - addl $12, %esp - ret - -icc generates: - - movl 4(%esp), %edx #3.6 - movl 8(%esp), %eax #3.6 - movapd (%eax), %xmm0 #4.22 - movlpd 12(%esp), %xmm0 #4.8 - movapd %xmm0, (%edx) #4.3 - ret #5.1 - -So icc is smart enough to know that B is in memory so it doesn't load it and -store it back to stack. - -//===---------------------------------------------------------------------===// - -__m128d test1( __m128d A, __m128d B) { - return _mm_shuffle_pd(A, B, 0x3); -} - -compiles to - -shufpd $3, %xmm1, %xmm0 - -Perhaps it's better to use unpckhpd instead? - -unpckhpd %xmm1, %xmm0 - -Don't know if unpckhpd is faster. But it is shorter. 
- -//===---------------------------------------------------------------------===// - This code generates ugly code, probably due to costs being off or something: -void %test(float* %P, <4 x float>* %P2 ) { +define void @test(float* %P, <4 x float>* %P2 ) { %xFloat0.688 = load float* %P - %loadVector37.712 = load <4 x float>* %P2 - %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3 + %tmp = load <4 x float>* %P2 + %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3 store <4 x float> %inFloat3.713, <4 x float>* %P2 ret void } @@ -487,17 +316,16 @@ void %test(float* %P, <4 x float>* %P2 ) { Generates: _test: - pxor %xmm0, %xmm0 - movd %xmm0, %eax ;; EAX = 0! - movl 8(%esp), %ecx - movaps (%ecx), %xmm0 - pinsrw $6, %eax, %xmm0 - shrl $16, %eax ;; EAX = 0 again! - pinsrw $7, %eax, %xmm0 - movaps %xmm0, (%ecx) - ret + movl 8(%esp), %eax + movaps (%eax), %xmm0 + pxor %xmm1, %xmm1 + movaps %xmm0, %xmm2 + shufps $50, %xmm1, %xmm2 + shufps $132, %xmm2, %xmm0 + movaps %xmm0, (%eax) + ret -It would be better to generate: +Would it be better to generate: _test: movl 8(%esp), %ecx @@ -508,7 +336,7 @@ _test: movaps %xmm0, (%ecx) ret -or use pxor (to make a zero vector) and shuffle (to insert it). +? //===---------------------------------------------------------------------===// @@ -536,223 +364,592 @@ nodes which are selected to max / min instructions that are marked commutable. //===---------------------------------------------------------------------===// -We should compile this: +We should materialize vector constants like "all ones" and "signbit" with +code like: + + cmpeqps xmm1, xmm1 ; xmm1 = all-ones + +and: + cmpeqps xmm1, xmm1 ; xmm1 = all-ones + pslld xmm1, 31 ; xmm1 = all 100000000000... + +instead of using a load from the constant pool. The latter is important for +ABS/NEG/copysign etc. 
+ +//===---------------------------------------------------------------------===// + +These functions: + #include -typedef union { - int i[4]; - float f[4]; - __m128 v; -} vector4_t; -void swizzle (const void *a, vector4_t * b, vector4_t * c) { - b->v = _mm_loadl_pi (b->v, (__m64 *) a); - c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1); +__m128i a; +void x(unsigned short n) { + a = _mm_slli_epi32 (a, n); +} +void y(unsigned n) { + a = _mm_slli_epi32 (a, n); } -to: - -_swizzle: - movl 4(%esp), %eax - movl 8(%esp), %edx - movl 12(%esp), %ecx - movlps (%eax), %xmm0 - movlps %xmm0, (%edx) - movlps 8(%eax), %xmm0 - movlps %xmm0, (%ecx) +compile to ( -O3 -static -fomit-frame-pointer): +_x: + movzwl 4(%esp), %eax + movd %eax, %xmm0 + movaps _a, %xmm1 + pslld %xmm0, %xmm1 + movaps %xmm1, _a ret - -not: - -swizzle: - movl 8(%esp), %eax - movaps (%eax), %xmm0 - movl 4(%esp), %ecx - movlps (%ecx), %xmm0 - movaps %xmm0, (%eax) - movl 12(%esp), %eax - movaps (%eax), %xmm0 - movlps 8(%ecx), %xmm0 - movaps %xmm0, (%eax) +_y: + movd 4(%esp), %xmm0 + movaps _a, %xmm1 + pslld %xmm0, %xmm1 + movaps %xmm1, _a ret -//===---------------------------------------------------------------------===// +"y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems +like movd would be sufficient in both cases as the value is already zero +extended in the 32-bit stack slot IIRC. For signed short, it should also be +safe, as a really-signed value would be undefined for pslld. -This code: -#include -__m128i test(long long i) { return _mm_cvtsi64x_si128(i); } +//===---------------------------------------------------------------------===// -Should turn into a single 'movq %rdi, %xmm0' instruction. 
Instead, we -get this (on x86-64): +#include <math.h> +int t1(double d) { return signbit(d); } -_test: - movd %rdi, %xmm1 - xorps %xmm0, %xmm0 - movsd %xmm1, %xmm0 +This currently compiles to: + subl $12, %esp + movsd 16(%esp), %xmm0 + movsd %xmm0, (%esp) + movl 4(%esp), %eax + shrl $31, %eax + addl $12, %esp ret -The LLVM IR is: +We should use movmskp{s|d} instead. + +//===---------------------------------------------------------------------===// + +CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single +(aligned) vector load. This functionality has a couple of problems. + +1. The code to infer alignment from loads of globals is in the X86 backend, + not the dag combiner. This is because dagcombine2 needs to be able to see + through the X86ISD::Wrapper node, which DAGCombine can't really do. +2. The code for turning 4 x load into a single vector load is target + independent and should be moved to the dag combiner. +3. The code for turning 4 x load into a vector load can only handle a direct + load from a global or a direct load from the stack. It should be generalized + to handle any load from P, P+4, P+8, P+12, where P can be anything. +4. The alignment inference code cannot handle loads from globals in non-static + mode because it doesn't look through the extra dyld stub load. If you try + vec_align.ll without -relocation-model=static, you'll see what I mean. + +//===---------------------------------------------------------------------===// + +We should lower store(fneg(load p), q) into an integer load+xor+store, which +eliminates a constant pool load. 
For example, consider: -target triple = "x86_64-apple-darwin8" -define <2 x i64> @test(i64 %i) { +define i64 @ccosf(float %z.0, float %z.1) nounwind readonly { entry: - %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0 - %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1 - ret <2 x i64> %tmp11 + %tmp6 = fsub float -0.000000e+00, %z.1 ; [#uses=1] + %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly + ret i64 %tmp20 } +declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly + +This currently compiles to: + +LCPI1_0: # <4 x float> + .long 2147483648 # float -0 + .long 2147483648 # float -0 + .long 2147483648 # float -0 + .long 2147483648 # float -0 +_ccosf: + subl $12, %esp + movss 16(%esp), %xmm0 + movss %xmm0, 4(%esp) + movss 20(%esp), %xmm0 + xorps LCPI1_0, %xmm0 + movss %xmm0, (%esp) + call L_ccoshf$stub + addl $12, %esp + ret + +Note the load into xmm0, then xor (to negate), then store. In PIC mode, +this code computes the pic base and does two loads to do the constant pool +load, so the improvement is much bigger. + +The tricky part about this xform is that the argument load/store isn't exposed +until post-legalize, and at that point, the fneg has been custom expanded into +an X86 fxor. This means that we need to handle this case in the x86 backend +instead of in target independent code. + +//===---------------------------------------------------------------------===// + +Non-SSE4 insert into 16 x i8 is atrociously bad. + +//===---------------------------------------------------------------------===// + +<2 x i64> extract is substantially worse than <2 x f64>, even if the destination +is memory. //===---------------------------------------------------------------------===// -These functions should produce the same code: +SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext +sitting between the truncate and the extract. 
+ +//===---------------------------------------------------------------------===// + +INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert +any number of 0.0 simultaneously. Currently we only use it for simple +insertions. + +See comments in LowerINSERT_VECTOR_ELT_SSE4. + +//===---------------------------------------------------------------------===// + +On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not +Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are +legal, it'll just take a few extra patterns written in the .td file. + +Note: this is not a code quality issue; the custom lowered code happens to be +right, but we shouldn't have to custom lower anything. This is probably related +to <2 x i64> ops being so bad. + +//===---------------------------------------------------------------------===// +'select' on vectors and scalars could be a whole lot better. We currently +lower them to conditional branches. On x86-64 for example, we compile this: + +double test(double a, double b, double c, double d) { return a -typedef long long __m128i __attribute__ ((__vector_size__ (16))); +typedef short vSInt16 __attribute__ ((__vector_size__ (16))); + +static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873, +- 22725, - 12873};; -int foo(__m128i* val) { - return __builtin_ia32_vec_ext_v4si(*val, 1); +vSInt16 madd(vSInt16 b) +{ + return _mm_madd_epi16(a, b); } -int bar(__m128i* val) { - union vs { - __m128i *_v; - int* _s; - } v = {val}; - return v._s[1]; + +Generated code (x86-32, linux): +madd: + pushl %ebp + movl %esp, %ebp + andl $-16, %esp + movaps .LCPI1_0, %xmm1 + pmaddwd %xmm1, %xmm0 + movl %ebp, %esp + popl %ebp + ret + +//===---------------------------------------------------------------------===// + +Consider: +#include +__m128 foo2 (float x) { + return _mm_set_ps (0, 0, x, 0); } -We currently produce (with -m64): +In x86-32 mode, we generate this spiffy code: -_foo: - 
pshufd $1, (%rdi), %xmm0 - movd %xmm0, %eax - ret -_bar: - movl 4(%rdi), %eax - ret +_foo2: + movss 4(%esp), %xmm0 + pshufd $81, %xmm0, %xmm0 + ret + +in x86-64 mode, we generate this code, which could be better: + +_foo2: + xorps %xmm1, %xmm1 + movss %xmm0, %xmm1 + pshufd $81, %xmm1, %xmm0 + ret + +In sse4 mode, we could use insertps to make both better. + +Here's another testcase that could use insertps [mem]: + +#include +extern float x2, x3; +__m128 foo1 (float x1, float x4) { + return _mm_set_ps (x2, x1, x3, x4); +} + +gcc mainline compiles it to: + +foo1: + insertps $0x10, x2(%rip), %xmm0 + insertps $0x10, x3(%rip), %xmm1 + movaps %xmm1, %xmm2 + movlhps %xmm0, %xmm2 + movaps %xmm2, %xmm0 + ret //===---------------------------------------------------------------------===// -We should materialize vector constants like "all ones" and "signbit" with -code like: +We compile vector multiply-by-constant into poor code: - cmpeqps xmm1, xmm1 ; xmm1 = all-ones +define <4 x i32> @f(<4 x i32> %i) nounwind { + %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 > + ret <4 x i32> %A +} -and: - cmpeqps xmm1, xmm1 ; xmm1 = all-ones - psrlq xmm1, 31 ; xmm1 = all 100000000000... +On targets without SSE4.1, this compiles into: + +LCPI1_0: ## <4 x i32> + .long 10 + .long 10 + .long 10 + .long 10 + .text + .align 4,0x90 + .globl _f +_f: + pshufd $3, %xmm0, %xmm1 + movd %xmm1, %eax + imull LCPI1_0+12, %eax + movd %eax, %xmm1 + pshufd $1, %xmm0, %xmm2 + movd %xmm2, %eax + imull LCPI1_0+4, %eax + movd %eax, %xmm2 + punpckldq %xmm1, %xmm2 + movd %xmm0, %eax + imull LCPI1_0, %eax + movd %eax, %xmm1 + movhlps %xmm0, %xmm0 + movd %xmm0, %eax + imull LCPI1_0+8, %eax + movd %eax, %xmm0 + punpckldq %xmm0, %xmm1 + movaps %xmm1, %xmm0 + punpckldq %xmm2, %xmm0 + ret -instead of using a load from the constant pool. The later is important for -ABS/NEG/copysign etc. +It would be better to synthesize integer vector multiplication by constants +using shifts and adds, pslld and paddd here. 
And even on targets with SSE4.1, +simple cases such as multiplication by powers of two would be better as +vector shifts than as multiplications. //===---------------------------------------------------------------------===// -"converting 64-bit constant pool entry to 32-bit not necessarily beneficial" -http://llvm.org/PR1264 +We compile this: + +__m128i +foo2 (char x) +{ + return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0); +} + +into: + movl $1, %eax + xorps %xmm0, %xmm0 + pinsrw $2, %eax, %xmm0 + movzbl 4(%esp), %eax + pinsrw $3, %eax, %xmm0 + movl $256, %eax + pinsrw $7, %eax, %xmm0 + ret + + +gcc-4.2: + subl $12, %esp + movzbl 16(%esp), %eax + movdqa LC0, %xmm0 + pinsrw $3, %eax, %xmm0 + addl $12, %esp + ret + .const + .align 4 +LC0: + .word 0 + .word 0 + .word 1 + .word 0 + .word 0 + .word 0 + .word 0 + .word 256 + +With SSE4, it should be + movdqa .LC0(%rip), %xmm0 + pinsrb $6, %edi, %xmm0 + +//===---------------------------------------------------------------------===// + +We should transform a shuffle of two vectors of constants into a single vector +of constants. Also, insertelement of a constant into a vector of constants +should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll. 
+ +We compiled it to something horrible: + + .align 4 +LCPI1_1: ## float + .long 1065353216 ## float 1 + .const + + .align 4 +LCPI1_0: ## <4 x float> + .space 4 + .long 1065353216 ## float 1 + .space 4 + .long 1065353216 ## float 1 + .text + .align 4,0x90 + .globl _t +_t: + xorps %xmm0, %xmm0 + movhps LCPI1_0, %xmm0 + movss LCPI1_1, %xmm1 + movaps %xmm0, %xmm2 + shufps $2, %xmm1, %xmm2 + shufps $132, %xmm2, %xmm0 + movaps %xmm0, 0 + +//===---------------------------------------------------------------------===// +rdar://5907648 + +This function: + +float foo(unsigned char x) { + return x; +} -For this test case: +compiles to (x86-32): -define double @foo(double %x) { - %y = mul double %x, 5.000000e-01 - ret double %y +define float @foo(i8 zeroext %x) nounwind { + %tmp12 = uitofp i8 %x to float ; [#uses=1] + ret float %tmp12 } -llc -march=x86-64 currently produces a 32-bit constant pool entry and this code: +compiles to: + +_foo: + subl $4, %esp + movzbl 8(%esp), %eax + cvtsi2ss %eax, %xmm0 + movss %xmm0, (%esp) + flds (%esp) + addl $4, %esp + ret + +We should be able to use: + cvtsi2ss 8(%esp), %xmm0 +since we know the stack slot is already zext'd. - cvtss2sd .LCPI1_0(%rip), %xmm1 - mulsd %xmm1, %xmm0 +//===---------------------------------------------------------------------===// -instead of just using a 64-bit constant pool entry with this: +Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64)) +when code size is critical. movlps is slower than movsd on core2 but it's one +byte shorter. - mulsd .LCPI1_0(%rip), %xmm0 +//===---------------------------------------------------------------------===// -This is due to the code in ExpandConstantFP in LegalizeDAG.cpp. It notices that -x86-64 indeed has an instruction to load a 32-bit float from memory and convert -it into a 64-bit float in a register, however it doesn't notice that this isn't -beneficial because it prevents the load from being folded into the multiply. 
+We should use a dynamic programming based approach to tell when using FPStack +operations is cheaper than SSE. SciMark montecarlo contains code like this +for example: + +double MonteCarlo_num_flops(int Num_samples) { + return ((double) Num_samples)* 4.0; +} + +In fpstack mode, this compiles into: + +LCPI1_0: + .long 1082130432 ## float 4.000000e+00 +_MonteCarlo_num_flops: + subl $4, %esp + movl 8(%esp), %eax + movl %eax, (%esp) + fildl (%esp) + fmuls LCPI1_0 + addl $4, %esp + ret + +in SSE mode, it compiles into significantly slower code: + +_MonteCarlo_num_flops: + subl $12, %esp + cvtsi2sd 16(%esp), %xmm0 + mulsd LCPI1_0, %xmm0 + movsd %xmm0, (%esp) + fldl (%esp) + addl $12, %esp + ret + +There are also other cases in scimark where using fpstack is better, it is +cheaper to do fld1 than load from a constant pool for example, so +"load, add 1.0, store" is better done in the fp stack, etc. //===---------------------------------------------------------------------===// -In this loop: +The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to +"cmpsd". For example, this code: -bb49: ; preds = %bb49, %bb49.preheader - %indvar = phi i32 [ 0, %bb49.preheader ], [ %indvar.next, %bb49 ] ; [#uses=2] - %dp.089.0.rec = shl i32 %indvar, 3 ; [#uses=2] - %dp.089.0 = getelementptr i32* %tmp89, i32 %dp.089.0.rec ; [#uses=1] - %tmp5051 = bitcast i32* %dp.089.0 to <2 x i64>* ; <<2 x i64>*> [#uses=1] - store <2 x i64> zeroinitializer, <2 x i64>* %tmp5051, align 16 - %dp.089.0.sum105 = or i32 %dp.089.0.rec, 4 ; [#uses=1] - %tmp56 = getelementptr i32* %tmp89, i32 %dp.089.0.sum105 ; [#uses=1] - %tmp5657 = bitcast i32* %tmp56 to <2 x i64>* ; <<2 x i64>*> [#uses=1] - store <2 x i64> zeroinitializer, <2 x i64>* %tmp5657, align 16 - %indvar.next = add i32 %indvar, 1 ; [#uses=2] - %exitcond = icmp eq i32 %indvar.next, %tmp98 ; [#uses=1] - br i1 %exitcond, label %bb72, label %bb49 +double d1(double x) { return x == x ? 
x : x + x; } -we get: +Compiles into: -LBB5_6: # bb49.preheader - shlw $2, %si - decw %si - movzwl %si, %eax - incl %eax - xorl %ecx, %ecx -LBB5_7: # bb49 - xorps %xmm0, %xmm0 # (1) - movaps %xmm0, (%edx) - movaps %xmm0, 16(%edx) - addl $32, %edx - incl %ecx - cmpl %eax, %ecx - jne LBB4_7 # bb47 +_d1: + ucomisd %xmm0, %xmm0 + jnp LBB1_2 + addsd %xmm0, %xmm0 + ret +LBB1_2: + ret -The instruction at (1) can be moved out of the main body of the loop. +Also, the 'ret's should be shared. This is PR6032. //===---------------------------------------------------------------------===// -These functions: +These should compile into the same code (PR6214): Perhaps instcombine should +canonicalize the former into the latter? -#include -__m128i a; -void x(unsigned short n) { - a = _mm_slli_epi32 (a, n); +define float @foo(float %x) nounwind { + %t = bitcast float %x to i32 + %s = and i32 %t, 2147483647 + %d = bitcast i32 %s to float + ret float %d } -void y(unsigned n) { - a = _mm_slli_epi32 (a, n); + +declare float @fabsf(float %n) +define float @bar(float %x) nounwind { + %d = call float @fabsf(float %x) + ret float %d } -compile to ( -O3 -static -fomit-frame-pointer): -_x: - movzwl 4(%esp), %eax - movd %eax, %xmm0 - movaps _a, %xmm1 - pslld %xmm0, %xmm1 - movaps %xmm1, _a - ret -_y: - movd 4(%esp), %xmm0 - movaps _a, %xmm1 - pslld %xmm0, %xmm1 - movaps %xmm1, _a - ret +//===---------------------------------------------------------------------===// -"y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems -like movd would be sufficient in both cases as the value is already zero -extended in the 32-bit stack slot IIRC. For signed short, it should also be -save, as a really-signed value would be undefined for pslld. 
+This IR (from PR6194): + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-darwin10.0.0" + +%0 = type { double, double } +%struct.float3 = type { float, float, float } + +define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp { +entry: + %tmp18 = extractvalue %0 %0, 0 ; [#uses=1] + %tmp19 = bitcast double %tmp18 to i64 ; [#uses=1] + %tmp20 = zext i64 %tmp19 to i128 ; [#uses=1] + %tmp10 = lshr i128 %tmp20, 32 ; [#uses=1] + %tmp11 = trunc i128 %tmp10 to i32 ; [#uses=1] + %tmp12 = bitcast i32 %tmp11 to float ; [#uses=1] + %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; [#uses=1] + store float %tmp12, float* %tmp5 + ret void +} + +Compiles to: + +_test: ## @test + movd %xmm0, %rax + shrq $32, %rax + movl %eax, 4(%rdi) + ret +This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and +doing a shuffle from v[1] to v[0] then a float store. //===---------------------------------------------------------------------===// -#include -int t1(double d) { return signbit(d); } +On SSE4 machines, we compile this code: -This currently compiles to: - subl $12, %esp - movsd 16(%esp), %xmm0 - movsd %xmm0, (%esp) - movl 4(%esp), %eax - shrl $31, %eax - addl $12, %esp +define <2 x float> @test2(<2 x float> %Q, <2 x float> %R, + <2 x float> *%P) nounwind { + %Z = fadd <2 x float> %Q, %R + + store <2 x float> %Z, <2 x float> *%P + ret <2 x float> %Z +} + +into: + +_test2: ## @test2 +## BB#0: + insertps $0, %xmm2, %xmm2 + insertps $16, %xmm3, %xmm2 + insertps $0, %xmm0, %xmm3 + insertps $16, %xmm1, %xmm3 + addps %xmm2, %xmm3 + movq %xmm3, (%rdi) + movaps %xmm3, %xmm0 + pshufd $1, %xmm3, %xmm1 + ## kill: XMM1 XMM1 ret -We should use movmskp{s|d} instead. +The insertps's of $0 are pointless complex copies. 
+ +//===---------------------------------------------------------------------===// + +[UNSAFE FP] + +void foo(double, double, double); +void norm(double x, double y, double z) { + double scale = __builtin_sqrt(x*x + y*y + z*z); + foo(x/scale, y/scale, z/scale); +} + +We currently generate an sqrtsd and 3 divsd instructions. This is bad, fp div is +slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first +and emit 3 mulsd in place of the divs. This can be done as a target-independent +transform. + +If we're dealing with floats instead of doubles we could even replace the sqrtss +and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the +cost of reduced accuracy. + +//===---------------------------------------------------------------------===// +This function should be matched to haddpd when the appropriate CPU is enabled: + +#include +double f (__m128d p) { + return p[0] + p[1]; +} + +similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should +turn into hsubpd also. + +//===---------------------------------------------------------------------===//