X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FREADME.txt;h=073e2dacef183b26bab7ac32332d4a19f2702471;hb=c59e52108bbfca50b23c5d10706484d4b012c344;hp=5084467657ec8f04dd0a149a05c69d6715bafb8b;hpb=2420d812475ebbb835585db1b2bbad04e55cb6f3;p=oota-llvm.git diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index 5084467657e..073e2dacef1 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -2,6 +2,14 @@ // Random ideas for the X86 backend. //===---------------------------------------------------------------------===// +Missing features: + - Support for SSE4: http://www.intel.com/software/penryn +http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf + - support for 3DNow! + - weird abis? + +//===---------------------------------------------------------------------===// + Add a MUL2U and MUL2S nodes to represent a multiply that returns both the Hi and Lo parts (combination of MUL and MULH[SU] into one node). Add this to X86, & make the dag combiner produce it when needed. This will eliminate one @@ -18,6 +26,26 @@ long long test(int X, int Y) { return (long long)X*Y; } ... which should only be one imul instruction. +or: + +unsigned long long int t2(unsigned int a, unsigned int b) { + return (unsigned long long)a * b; +} + +... which should be one mul instruction. + + +This can be done with a custom expander, but it would be nice to move this to +generic code. + +//===---------------------------------------------------------------------===// + +CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86 +backend knows how to three-addressify this shift, but it appears the register +allocator isn't even asking it to do so in this case. We should investigate +why this isn't happening, it could have significant impact on other important +cases for X86 as well. + //===---------------------------------------------------------------------===// This should be one DIV/IDIV instruction, not a libcall: @@ -45,6 +73,20 @@ http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html Another useful one would be ~0ULL >> X and ~0ULL << X. +One better solution for 1LL << x is: + xorl %eax, %eax + xorl %edx, %edx + testb $32, %cl + sete %al + setne %dl + sall %cl, %eax + sall %cl, %edx + +But that requires good 8-bit subreg support. + +64-bit shifts (in general) expand to really bad code. Instead of using +cmovs, we should expand to a conditional branch like GCC produces. + //===---------------------------------------------------------------------===// Compile this: @@ -80,15 +122,6 @@ allocator. Delay codegen until post register allocation. //===---------------------------------------------------------------------===// -Model X86 EFLAGS as a real register to avoid redudant cmp / test. e.g. - - cmpl $1, %eax - setg %al - testb %al, %al # unnecessary - jne .BB7 - -//===---------------------------------------------------------------------===// - Count leading zeros and count trailing zeros: int clz(int X) { return __builtin_clz(X); } @@ -106,11 +139,18 @@ ctz: however, check that these are defined for 0 and 32. Our intrinsics are, GCC's aren't. 
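
A sketch of the zero-input wrinkle (ctz_defined is a made-up name, not backend
code): bsf/bsr leave their destination undefined when the source is 0, while
our intrinsics are defined there, so a branchless lowering needs a fixup such
as a cmov:

int ctz_defined(unsigned X) {
  /* bsf covers the 1..31 cases; the ": 32" fixup is the part bsf alone
     does not provide (e.g. bsf followed by a cmov). */
  return X ? __builtin_ctz(X) : 32;
}
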
+Another example (use predsimplify to eliminate a select): + +int foo (unsigned long j) { + if (j) + return __builtin_ffs (j) - 1; + else + return 0; +} + //===---------------------------------------------------------------------===// -Use push/pop instructions in prolog/epilog sequences instead of stores off -ESP (certain code size win, perf win on some [which?] processors). -Also, it appears icc use push for parameter passing. Need to investigate. +It appears icc use push for parameter passing. Need to investigate. //===---------------------------------------------------------------------===// @@ -126,6 +166,8 @@ commutative, it is not matched with the load on both sides. The dag combiner should be made smart enough to cannonicalize the load into the RHS of a compare when it can invert the result of the compare for free. +//===---------------------------------------------------------------------===// + How about intrinsics? An example is: *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C)); @@ -140,128 +182,6 @@ target specific hook. //===---------------------------------------------------------------------===// -When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and -other fast SSE modes. - -//===---------------------------------------------------------------------===// - -Think about doing i64 math in SSE regs. - -//===---------------------------------------------------------------------===// - -The DAG Isel doesn't fold the loads into the adds in this testcase. The -pattern selector does. This is because the chain value of the load gets -selected first, and the loads aren't checking to see if they are only used by -and add. - -.ll: - -int %test(int* %x, int* %y, int* %z) { - %X = load int* %x - %Y = load int* %y - %Z = load int* %z - %a = add int %X, %Y - %b = add int %a, %Z - ret int %b -} - -dag isel: - -_test: - movl 4(%esp), %eax - movl (%eax), %eax - movl 8(%esp), %ecx - movl (%ecx), %ecx - addl %ecx, %eax - movl 12(%esp), %ecx - movl (%ecx), %ecx - addl %ecx, %eax - ret - -pattern isel: - -_test: - movl 12(%esp), %ecx - movl 4(%esp), %edx - movl 8(%esp), %eax - movl (%eax), %eax - addl (%edx), %eax - addl (%ecx), %eax - ret - -This is bad for register pressure, though the dag isel is producing a -better schedule. :) - -//===---------------------------------------------------------------------===// - -This testcase should have no SSE instructions in it, and only one load from -a constant pool: - -double %test3(bool %B) { - %C = select bool %B, double 123.412, double 523.01123123 - ret double %C -} - -Currently, the select is being lowered, which prevents the dag combiner from -turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)' - -The pattern isel got this one right. - -//===---------------------------------------------------------------------===// - -SSE doesn't have [mem] op= reg instructions. If we have an SSE instruction -like this: - - X += y - -and the register allocator decides to spill X, it is cheaper to emit this as: - -Y += [xslot] -store Y -> [xslot] - -than as: - -tmp = [xslot] -tmp += y -store tmp -> [xslot] - -..and this uses one fewer register (so this should be done at load folding -time, not at spiller time). *Note* however that this can only be done -if Y is dead. Here's a testcase: - -%.str_3 = external global [15 x sbyte] ; <[15 x sbyte]*> [#uses=0] -implementation ; Functions: -declare void %printf(int, ...) 
-void %main() { -build_tree.exit: - br label %no_exit.i7 -no_exit.i7: ; preds = %no_exit.i7, %build_tree.exit - %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ] ; [#uses=1] - %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ] ; [#uses=1] - %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00 - %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00 - br bool false, label %Compute_Tree.exit23, label %no_exit.i7 -Compute_Tree.exit23: ; preds = %no_exit.i7 - tail call void (int, ...)* %printf( int 0 ) - store double %tmp.34.i18, double* null - ret void -} - -We currently emit: - -.BBmain_1: - xorpd %XMM1, %XMM1 - addsd %XMM0, %XMM1 -*** movsd %XMM2, QWORD PTR [%ESP + 8] -*** addsd %XMM2, %XMM1 -*** movsd QWORD PTR [%ESP + 8], %XMM2 - jmp .BBmain_1 # no_exit.i7 - -This is a bugpoint reduced testcase, which is why the testcase doesn't make -much sense (e.g. its an infinite loop). :) - -//===---------------------------------------------------------------------===// - In many cases, LLVM generates code like this: _test: @@ -275,7 +195,7 @@ on some processors (which ones?), it is more efficient to do this: _test: movl 8(%esp), %ebx - xor %eax, %eax + xor %eax, %eax cmpl %ebx, 4(%esp) setl %al ret @@ -284,68 +204,6 @@ Doing this correctly is tricky though, as the xor clobbers the flags. //===---------------------------------------------------------------------===// -We should generate 'test' instead of 'cmp' in various cases, e.g.: - -bool %test(int %X) { - %Y = shl int %X, ubyte 1 - %C = seteq int %Y, 0 - ret bool %C -} -bool %test(int %X) { - %Y = and int %X, 8 - %C = seteq int %Y, 0 - ret bool %C -} - -This may just be a matter of using 'test' to write bigger patterns for X86cmp. - -An important case is comparison against zero: - -if (X == 0) ... - -instead of: - - cmpl $0, %eax - je LBB4_2 #cond_next - -use: - test %eax, %eax - jz LBB4_2 - -which is smaller. - -//===---------------------------------------------------------------------===// - -SSE should implement 'select_cc' using 'emulated conditional moves' that use -pcmp/pand/pandn/por to do a selection instead of a conditional branch: - -double %X(double %Y, double %Z, double %A, double %B) { - %C = setlt double %A, %B - %z = add double %Z, 0.0 ;; select operand is not a load - %D = select bool %C, double %Y, double %z - ret double %D -} - -We currently emit: - -_X: - subl $12, %esp - xorpd %xmm0, %xmm0 - addsd 24(%esp), %xmm0 - movsd 32(%esp), %xmm1 - movsd 16(%esp), %xmm2 - ucomisd 40(%esp), %xmm1 - jb LBB_X_2 -LBB_X_1: - movsd %xmm0, %xmm2 -LBB_X_2: - movsd %xmm2, (%esp) - fldl (%esp) - addl $12, %esp - ret - -//===---------------------------------------------------------------------===// - We should generate bts/btr/etc instructions on targets where they are cheap or when codesize is important. e.g., for: @@ -375,12 +233,6 @@ when we can spare a register. It reduces code size. //===---------------------------------------------------------------------===// -It's not clear whether we should use pxor or xorps / xorpd to clear XMM -registers. The choice may depend on subtarget information. We should do some -more experiments on different x86 machines. - -//===---------------------------------------------------------------------===// - Evaluate what the best way to codegen sdiv X, (2^C) is. 
For X/8, we currently get this: @@ -412,36 +264,6 @@ which is probably slower, but it's interesting at least :) //===---------------------------------------------------------------------===// -Currently the x86 codegen isn't very good at mixing SSE and FPStack -code: - -unsigned int foo(double x) { return x; } - -foo: - subl $20, %esp - movsd 24(%esp), %xmm0 - movsd %xmm0, 8(%esp) - fldl 8(%esp) - fisttpll (%esp) - movl (%esp), %eax - addl $20, %esp - ret - -This will be solved when we go to a dynamic programming based isel. - -//===---------------------------------------------------------------------===// - -Should generate min/max for stuff like: - -void minf(float a, float b, float *X) { - *X = a <= b ? a : b; -} - -Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN -and ISD::FMAX node types? - -//===---------------------------------------------------------------------===// - The first BB of this code: declare bool %foo() @@ -472,22 +294,6 @@ Enable X86InstrInfo::convertToThreeAddress(). //===---------------------------------------------------------------------===// -Investigate whether it is better to codegen the following - - %tmp.1 = mul int %x, 9 -to - - movl 4(%esp), %eax - leal (%eax,%eax,8), %eax - -as opposed to what llc is currently generating: - - imull $9, 4(%esp), %eax - -Currently the load folding imull has a higher complexity than the LEA32 pattern. - -//===---------------------------------------------------------------------===// - We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl We should leave these as libcalls for everything over a much lower threshold, since libc is hand tuned for medium and large mem ops (avoiding RFO for large @@ -495,45 +301,6 @@ stores, TLB preheating, etc) //===---------------------------------------------------------------------===// -Lower memcpy / memset to a series of SSE 128 bit move instructions when it's -feasible. - -//===---------------------------------------------------------------------===// - -Teach the coalescer to commute 2-addr instructions, allowing us to eliminate -the reg-reg copy in this example: - -float foo(int *x, float *y, unsigned c) { - float res = 0.0; - unsigned i; - for (i = 0; i < c; i++) { - float xx = (float)x[i]; - xx = xx * y[i]; - xx += res; - res = xx; - } - return res; -} - -LBB_foo_3: # no_exit - cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI] - mulss %XMM0, DWORD PTR [%EAX + 4*%ESI] - addss %XMM0, %XMM1 - inc %ESI - cmp %ESI, %ECX -**** movaps %XMM1, %XMM0 - jb LBB_foo_3 # no_exit - -//===---------------------------------------------------------------------===// - -Codegen: - if (copysign(1.0, x) == copysign(1.0, y)) -into: - if (x^y & mask) -when using SSE. - -//===---------------------------------------------------------------------===// - Optimize this into something reasonable: x * copysign(1.0, y) * copysign(1.0, z) @@ -595,52 +362,64 @@ lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor. //===---------------------------------------------------------------------===// -Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 / -FR64 to VR128. +We are generating far worse code than gcc: -//===---------------------------------------------------------------------===// +volatile short X, Y; -mov $reg, 48(%esp) -... 
-leal 48(%esp), %eax -mov %eax, (%esp) -call _foo +void foo(int N) { + int i; + for (i = 0; i < N; i++) { X = i; Y = i*4; } +} -Obviously it would have been better for the first mov (or any op) to store -directly %esp[0] if there are no other uses. +LBB1_1: #bb.preheader + xorl %ecx, %ecx + xorw %dx, %dx +LBB1_2: #bb + movl L_X$non_lazy_ptr, %esi + movw %dx, (%esi) + movw %dx, %si + shlw $2, %si + movl L_Y$non_lazy_ptr, %edi + movw %si, (%edi) + incl %ecx + incw %dx + cmpl %eax, %ecx + jne LBB1_2 #bb -//===---------------------------------------------------------------------===// +vs. -Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half -of a v4sf value. + xorl %edx, %edx + movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi + movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx +L4: + movw %dx, (%esi) + leal 0(,%edx,4), %eax + movw %ax, (%ecx) + addl $1, %edx + cmpl %edx, %edi + jne L4 -//===---------------------------------------------------------------------===// +There are 3 issues: -Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}. -Perhaps use pxor / xorp* to clear a XMM register first? +1. Lack of post regalloc LICM. +2. LSR unable to reused IV for a different type (i16 vs. i32) even though + the cast would be free. //===---------------------------------------------------------------------===// -Better codegen for: - -void f(float a, float b, vector float * out) { *out = (vector float){ a, 0.0, 0.0, b}; } -void f(float a, float b, vector float * out) { *out = (vector float){ a, b, 0.0, 0}; } +Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 / +FR64 to VR128. -For the later we generate: +//===---------------------------------------------------------------------===// -_f: - pxor %xmm0, %xmm0 - movss 8(%esp), %xmm1 - movaps %xmm0, %xmm2 - unpcklps %xmm1, %xmm2 - movss 4(%esp), %xmm1 - unpcklps %xmm0, %xmm1 - unpcklps %xmm2, %xmm1 - movl 12(%esp), %eax - movaps %xmm1, (%eax) - ret +mov $reg, 48(%esp) +... +leal 48(%esp), %eax +mov %eax, (%esp) +call _foo -This seems like it should use shufps, one for each of a & b. +Obviously it would have been better for the first mov (or any op) to store +directly %esp[0] if there are no other uses. //===---------------------------------------------------------------------===// @@ -676,448 +455,661 @@ We probably need some kind of target DAG combine hook to fix this. //===---------------------------------------------------------------------===// -How to decide when to use the "floating point version" of logical ops? Here are -some code fragments: - - movaps LCPI5_5, %xmm2 - divps %xmm1, %xmm2 - mulps %xmm2, %xmm3 - mulps 8656(%ecx), %xmm3 - addps 8672(%ecx), %xmm3 - andps LCPI5_6, %xmm2 - andps LCPI5_1, %xmm3 - por %xmm2, %xmm3 - movdqa %xmm3, (%edi) - - movaps LCPI5_5, %xmm1 - divps %xmm0, %xmm1 - mulps %xmm1, %xmm3 - mulps 8656(%ecx), %xmm3 - addps 8672(%ecx), %xmm3 - andps LCPI5_6, %xmm1 - andps LCPI5_1, %xmm3 - orps %xmm1, %xmm3 - movaps %xmm3, 112(%esp) - movaps %xmm3, (%ebx) - -Due to some minor source change, the later case ended up using orps and movaps -instead of por and movdqa. Does it matter? +We generate significantly worse code for this than GCC: +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150 +http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701 + +There is also one case we do worse on PPC. //===---------------------------------------------------------------------===// -Use movddup to splat a v2f64 directly from a memory source. e.g. 
+If shorter, we should use things like: +movzwl %ax, %eax +instead of: +andl $65535, %EAX + +The former can also be used when the two-addressy nature of the 'and' would +require a copy to be inserted (in X86InstrInfo::convertToThreeAddress). + +//===---------------------------------------------------------------------===// -#include +Consider this: -void test(__m128d *r, double A) { - *r = _mm_set1_pd(A); +typedef struct pair { float A, B; } pair; +void pairtest(pair P, float *FP) { + *FP = P.A+P.B; } -llc: +We currently generate this code with llvmgcc4: -_test: - movsd 8(%esp), %xmm0 - unpcklpd %xmm0, %xmm0 - movl 4(%esp), %eax - movapd %xmm0, (%eax) - ret +_pairtest: + movl 8(%esp), %eax + movl 4(%esp), %ecx + movd %eax, %xmm0 + movd %ecx, %xmm1 + addss %xmm0, %xmm1 + movl 12(%esp), %eax + movss %xmm1, (%eax) + ret + +we should be able to generate: +_pairtest: + movss 4(%esp), %xmm0 + movl 12(%esp), %eax + addss 8(%esp), %xmm0 + movss %xmm0, (%eax) + ret -icc: +The issue is that llvmgcc4 is forcing the struct to memory, then passing it as +integer chunks. It does this so that structs like {short,short} are passed in +a single 32-bit integer stack slot. We should handle the safe cases above much +nicer, while still handling the hard cases. -_test: - movl 4(%esp), %eax - movddup 8(%esp), %xmm0 - movapd %xmm0, (%eax) +While true in general, in this specific case we could do better by promoting +load int + bitcast to float -> load fload. This basically needs alignment info, +the code is already implemented (but disabled) in dag combine). + +//===---------------------------------------------------------------------===// + +Another instruction selector deficiency: + +void %bar() { + %tmp = load int (int)** %foo + %tmp = tail call int %tmp( int 3 ) + ret void +} + +_bar: + subl $12, %esp + movl L_foo$non_lazy_ptr, %eax + movl (%eax), %eax + call *%eax + addl $12, %esp ret +The current isel scheme will not allow the load to be folded in the call since +the load's chain result is read by the callseq_start. + //===---------------------------------------------------------------------===// -X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible -to choose between movaps, movapd, and movdqa based on types of source and -destination? +Don't forget to find a way to squash noop truncates in the JIT environment. -How about andps, andpd, and pand? Do we really care about the type of the packed -elements? If not, why not always use the "ps" variants which are likely to be -shorter. +//===---------------------------------------------------------------------===// + +Implement anyext in the same manner as truncate that would allow them to be +eliminated. //===---------------------------------------------------------------------===// -We are emitting bad code for this: +How about implementing truncate / anyext as a property of machine instruction +operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register. +Do this for the cases where a truncate / anyext is guaranteed to be eliminated. +For IA32 that is truncate from 32 to 16 and anyext from 16 to 32. 
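
A tiny example of a truncate that should be free under that scheme
(trunc_to_i16 is only an illustration, not a test in the tree): the i32 -> i16
truncate below needs no instruction at all if the result is simply printed as
the 16-bit sub-register of the value already sitting in a 32-bit register.

unsigned short trunc_to_i16(unsigned int x) {
  /* Ideally a single 32-bit load of the argument, with the result
     referenced as %ax rather than copied or masked. */
  return (unsigned short)x;
}
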
-float %test(float* %V, int %I, int %D, float %V) { -entry: - %tmp = seteq int %D, 0 - br bool %tmp, label %cond_true, label %cond_false23 - -cond_true: - %tmp3 = getelementptr float* %V, int %I - %tmp = load float* %tmp3 - %tmp5 = setgt float %tmp, %V - %tmp6 = tail call bool %llvm.isunordered.f32( float %tmp, float %V ) - %tmp7 = or bool %tmp5, %tmp6 - br bool %tmp7, label %UnifiedReturnBlock, label %cond_next - -cond_next: - %tmp10 = add int %I, 1 - %tmp12 = getelementptr float* %V, int %tmp10 - %tmp13 = load float* %tmp12 - %tmp15 = setle float %tmp13, %V - %tmp16 = tail call bool %llvm.isunordered.f32( float %tmp13, float %V ) - %tmp17 = or bool %tmp15, %tmp16 - %retval = select bool %tmp17, float 0.000000e+00, float 1.000000e+00 - ret float %retval - -cond_false23: - %tmp28 = tail call float %foo( float* %V, int %I, int %D, float %V ) - ret float %tmp28 - -UnifiedReturnBlock: ; preds = %cond_true - ret float 0.000000e+00 +//===---------------------------------------------------------------------===// + +For this: + +int test(int a) +{ + return a * 3; } -declare bool %llvm.isunordered.f32(float, float) +We currently emits + imull $3, 4(%esp), %eax + +Perhaps this is what we really should generate is? Is imull three or four +cycles? Note: ICC generates this: + movl 4(%esp), %eax + leal (%eax,%eax,2), %eax + +The current instruction priority is based on pattern complexity. The former is +more "complex" because it folds a load so the latter will not be emitted. + +Perhaps we should use AddedComplexity to give LEA32r a higher priority? We +should always try to match LEA first since the LEA matching code does some +estimate to determine whether the match is profitable. -declare float %foo(float*, int, int, float) +However, if we care more about code size, then imull is better. It's two bytes +shorter than movl + leal. +//===---------------------------------------------------------------------===// -It exposes a known load folding problem: +Implement CTTZ, CTLZ with bsf and bsr. - movss (%edx,%ecx,4), %xmm1 - ucomiss %xmm1, %xmm0 +//===---------------------------------------------------------------------===// -As well as this: +It appears gcc place string data with linkonce linkage in +.section __TEXT,__const_coal,coalesced instead of +.section __DATA,__const_coal,coalesced. +Take a look at darwin.h, there are other Darwin assembler directives that we +do not make use of. -LBB_test_2: # cond_next - movss LCPI1_0, %xmm2 - pxor %xmm3, %xmm3 - ucomiss %xmm0, %xmm1 - jbe LBB_test_6 # cond_next -LBB_test_5: # cond_next - movaps %xmm2, %xmm3 -LBB_test_6: # cond_next - movss %xmm3, 40(%esp) - flds 40(%esp) - addl $44, %esp - ret +//===---------------------------------------------------------------------===// + +int %foo(int* %a, int %t) { +entry: + br label %cond_true + +cond_true: ; preds = %cond_true, %entry + %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ] + %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ] + %tmp2 = getelementptr int* %a, int %x.0.0 + %tmp3 = load int* %tmp2 ; [#uses=1] + %tmp5 = add int %t_addr.0.0, %x.0.0 ; [#uses=1] + %tmp7 = add int %tmp5, %tmp3 ; [#uses=2] + %tmp9 = add int %x.0.0, 1 ; [#uses=2] + %tmp = setgt int %tmp9, 39 ; [#uses=1] + br bool %tmp, label %bb12, label %cond_true + +bb12: ; preds = %cond_true + ret int %tmp7 +} + +is pessimized by -loop-reduce and -indvars + +//===---------------------------------------------------------------------===// -Clearly it's unnecessary to clear %xmm3. 
It's also not clear why we are emitting -three moves (movss, movaps, movss). +u32 to float conversion improvement: + +float uint32_2_float( unsigned u ) { + float fl = (int) (u & 0xffff); + float fh = (int) (u >> 16); + fh *= 0x1.0p16f; + return fh + fl; +} + +00000000 subl $0x04,%esp +00000003 movl 0x08(%esp,1),%eax +00000007 movl %eax,%ecx +00000009 shrl $0x10,%ecx +0000000c cvtsi2ss %ecx,%xmm0 +00000010 andl $0x0000ffff,%eax +00000015 cvtsi2ss %eax,%xmm1 +00000019 mulss 0x00000078,%xmm0 +00000021 addss %xmm1,%xmm0 +00000025 movss %xmm0,(%esp,1) +0000002a flds (%esp,1) +0000002d addl $0x04,%esp +00000030 ret //===---------------------------------------------------------------------===// -External test Nurbs exposed some problems. Look for -__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc -emits: - - movaps (%edx), %xmm2 #59.21 - movaps (%edx), %xmm5 #60.21 - movaps (%edx), %xmm4 #61.21 - movaps (%edx), %xmm3 #62.21 - movl 40(%ecx), %ebp #69.49 - shufps $0, %xmm2, %xmm5 #60.21 - movl 100(%esp), %ebx #69.20 - movl (%ebx), %edi #69.20 - imull %ebp, %edi #69.49 - addl (%eax), %edi #70.33 - shufps $85, %xmm2, %xmm4 #61.21 - shufps $170, %xmm2, %xmm3 #62.21 - shufps $255, %xmm2, %xmm2 #63.21 - lea (%ebp,%ebp,2), %ebx #69.49 - negl %ebx #69.49 - lea -3(%edi,%ebx), %ebx #70.33 - shll $4, %ebx #68.37 - addl 32(%ecx), %ebx #68.37 - testb $15, %bl #91.13 - jne L_B1.24 # Prob 5% #91.13 - -This is the llvm code after instruction scheduling: - -cond_next140 (0xa910740, LLVM BB @0xa90beb0): - %reg1078 = MOV32ri -3 - %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0 - %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40 - %reg1080 = IMUL32rr %reg1079, %reg1037 - %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0 - %reg1038 = LEA32r %reg1081, 1, %reg1080, -3 - %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32 - %reg1082 = SHL32ri %reg1038, 4 - %reg1039 = ADD32rr %reg1036, %reg1082 - %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0 - %reg1034 = SHUFPSrr %reg1083, %reg1083, 170 - %reg1032 = SHUFPSrr %reg1083, %reg1083, 0 - %reg1035 = SHUFPSrr %reg1083, %reg1083, 255 - %reg1033 = SHUFPSrr %reg1083, %reg1083, 85 - %reg1040 = MOV32rr %reg1039 - %reg1084 = AND32ri8 %reg1039, 15 - CMP32ri8 %reg1084, 0 - JE mbb - -Still ok. After register allocation: - -cond_next140 (0xa910740, LLVM BB @0xa90beb0): - %EAX = MOV32ri -3 - %EDX = MOV32rm , 1, %NOREG, 0 - ADD32rm %EAX, %EDX, 1, %NOREG, 0 - %EDX = MOV32rm , 1, %NOREG, 0 - %EDX = MOV32rm %EDX, 1, %NOREG, 40 - IMUL32rr %EAX, %EDX - %ESI = MOV32rm , 1, %NOREG, 0 - %ESI = MOV32rm %ESI, 1, %NOREG, 0 - MOV32mr , 1, %NOREG, 0, %ESI - %EAX = LEA32r %ESI, 1, %EAX, -3 - %ESI = MOV32rm , 1, %NOREG, 0 - %ESI = MOV32rm %ESI, 1, %NOREG, 32 - %EDI = MOV32rr %EAX - SHL32ri %EDI, 4 - ADD32rr %EDI, %ESI - %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0 - %XMM1 = MOVAPSrr %XMM0 - SHUFPSrr %XMM1, %XMM1, 170 - %XMM2 = MOVAPSrr %XMM0 - SHUFPSrr %XMM2, %XMM2, 0 - %XMM3 = MOVAPSrr %XMM0 - SHUFPSrr %XMM3, %XMM3, 255 - SHUFPSrr %XMM0, %XMM0, 85 - %EBX = MOV32rr %EDI - AND32ri8 %EBX, 15 - CMP32ri8 %EBX, 0 - JE mbb - -This looks really bad. The problem is shufps is a destructive opcode. Since it -appears as operand two in more than one shufps ops. It resulted in a number of -copies. Note icc also suffers from the same problem. Either the instruction -selector should select pshufd or The register allocator can made the two-address -to three-address transformation. - -It also exposes some other problems. See MOV32ri -3 and the spills. 
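
A minimal intrinsics reduction of the shufps problem above (splat4 is a
reduced sketch, not the Nurbs source, and assumes SSE2 for pshufd): every
splat reads the same source register, so a destructive shufps needs a movaps
copy per splat to keep the source live, while pshufd writes a separate
destination register and needs none.

#include <emmintrin.h>

void splat4(__m128i v, __m128i *out) {
  /* Same shuffle immediates as in the listing above: 0, 85, 170, 255. */
  out[0] = _mm_shuffle_epi32(v, 0);
  out[1] = _mm_shuffle_epi32(v, 85);
  out[2] = _mm_shuffle_epi32(v, 170);
  out[3] = _mm_shuffle_epi32(v, 255);
}
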
+When using fastcc abi, align stack slot of argument of type double on 8 byte +boundary to improve performance. //===---------------------------------------------------------------------===// -http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500 - -LLVM is producing bad code. - -LBB_main_4: # cond_true44 - addps %xmm1, %xmm2 - subps %xmm3, %xmm2 - movaps (%ecx), %xmm4 - movaps %xmm2, %xmm1 - addps %xmm4, %xmm1 - addl $16, %ecx - incl %edx - cmpl $262144, %edx - movaps %xmm3, %xmm2 - movaps %xmm4, %xmm3 - jne LBB_main_4 # cond_true44 - -There are two problems. 1) No need to two loop induction variables. We can -compare against 262144 * 16. 2) Known register coalescer issue. We should -be able eliminate one of the movaps: - - addps %xmm2, %xmm1 <=== Commute! - subps %xmm3, %xmm1 - movaps (%ecx), %xmm4 - movaps %xmm1, %xmm1 <=== Eliminate! - addps %xmm4, %xmm1 - addl $16, %ecx - incl %edx - cmpl $262144, %edx - movaps %xmm3, %xmm2 - movaps %xmm4, %xmm3 - jne LBB_main_4 # cond_true44 +Codegen: + +int f(int a, int b) { + if (a == 4 || a == 6) + b++; + return b; +} + + +as: + +or eax, 2 +cmp eax, 6 +jz label //===---------------------------------------------------------------------===// -Consider: +GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting +simplifications for integer "x cmp y ? a : b". For example, instead of: -__m128 test(float a) { - return _mm_set_ps(0.0, 0.0, 0.0, a*a); +int G; +void f(int X, int Y) { + G = X < 0 ? 14 : 13; } -This compiles into: +compiling to: -movss 4(%esp), %xmm1 -mulss %xmm1, %xmm1 -xorps %xmm0, %xmm0 -movss %xmm1, %xmm0 -ret +_f: + movl $14, %eax + movl $13, %ecx + movl 4(%esp), %edx + testl %edx, %edx + cmovl %eax, %ecx + movl %ecx, _G + ret -Because mulss doesn't modify the top 3 elements, the top elements of -xmm1 are already zero'd. We could compile this to: +it could be: +_f: + movl 4(%esp), %eax + sarl $31, %eax + notl %eax + addl $14, %eax + movl %eax, _G + ret -movss 4(%esp), %xmm0 -mulss %xmm0, %xmm0 -ret +etc. //===---------------------------------------------------------------------===// -Here's a sick and twisted idea. Consider code like this: +Currently we don't have elimination of redundant stack manipulations. Consider +the code: -__m128 test(__m128 a) { - float b = *(float*)&A; - ... - return _mm_set_ps(0.0, 0.0, 0.0, b); +int %main() { +entry: + call fastcc void %test1( ) + call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) ) + ret int 0 } -This might compile to this code: +declare fastcc void %test1() -movaps c(%esp), %xmm1 -xorps %xmm0, %xmm0 -movss %xmm1, %xmm0 -ret +declare fastcc void %test2(sbyte*) -Now consider if the ... code caused xmm1 to get spilled. This might produce -this code: -movaps c(%esp), %xmm1 -movaps %xmm1, c2(%esp) -... +This currently compiles to: + + subl $16, %esp + call _test5 + addl $12, %esp + subl $16, %esp + movl $_test5, (%esp) + call _test6 + addl $12, %esp + +The add\sub pair is really unneeded here. 
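
The same test case in C for reference (a sketch that ignores the fastcc
convention; test1/test2 are the externals from the IR above). The
addl $12, %esp / subl $16, %esp pair between the two calls could be folded
into a single subl $4, %esp, or the argument area could simply be kept at its
maximum size across both calls.

void test1(void);
void test2(void (*f)(void));

int main(void) {
  test1();
  test2(test1);   /* passes test1's address, as in the IR above */
  return 0;
}
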
+ +//===---------------------------------------------------------------------===// + +We currently compile sign_extend_inreg into two shifts: -xorps %xmm0, %xmm0 -movaps c2(%esp), %xmm1 -movss %xmm1, %xmm0 -ret +long foo(long X) { + return (long)(signed char)X; +} + +becomes: + +_foo: + movl 4(%esp), %eax + shll $24, %eax + sarl $24, %eax + ret + +This could be: + +_foo: + movsbl 4(%esp),%eax + ret + +//===---------------------------------------------------------------------===// -However, since the reload is only used by these instructions, we could -"fold" it into the uses, producing something like this: +Consider the expansion of: + +uint %test3(uint %X) { + %tmp1 = rem uint %X, 255 + ret uint %tmp1 +} + +Currently it compiles to: -movaps c(%esp), %xmm1 -movaps %xmm1, c2(%esp) +... + movl $2155905153, %ecx + movl 8(%esp), %esi + movl %esi, %eax + mull %ecx ... -movss c2(%esp), %xmm0 -ret +This could be "reassociated" into: -... saving two instructions. + movl $2155905153, %eax + movl 8(%esp), %ecx + mull %ecx -The basic idea is that a reload from a spill slot, can, if only one 4-byte -chunk is used, bring in 3 zeros the the one element instead of 4 elements. -This can be used to simplify a variety of shuffle operations, where the -elements are fixed zeros. +to avoid the copy. In fact, the existing two-address stuff would do this +except that mul isn't a commutative 2-addr instruction. I guess this has +to be done at isel time based on the #uses to mul? //===---------------------------------------------------------------------===// -We generate significantly worse code for this than GCC: -http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150 -http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701 +Make sure the instruction which starts a loop does not cross a cacheline +boundary. This requires knowning the exact length of each machine instruction. +That is somewhat complicated, but doable. Example 256.bzip2: -There is also one case we do worse on PPC. +In the new trace, the hot loop has an instruction which crosses a cacheline +boundary. In addition to potential cache misses, this can't help decoding as I +imagine there has to be some kind of complicated decoder reset and realignment +to grab the bytes from the next cacheline. + +532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines +942 942 0x3d03 movl %dh, (1809(%esp, %esi) +937 937 0x3d0a incl %esi +3 3 0x3d0b cmpb %bl, %dl +27 27 0x3d0d jnz 0x000062db //===---------------------------------------------------------------------===// -For this: +In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE. + +//===---------------------------------------------------------------------===// + +This could be a single 16-bit load. -#include -void test(__m128d *r, __m128d *A, double B) { - *r = _mm_loadl_pd(*A, &B); +int f(char *p) { + if ((p[0] == 1) & (p[1] == 2)) return 1; + return 0; } -We generates: +//===---------------------------------------------------------------------===// - subl $12, %esp - movsd 24(%esp), %xmm0 - movsd %xmm0, (%esp) - movl 20(%esp), %eax - movapd (%eax), %xmm0 - movlpd (%esp), %xmm0 - movl 16(%esp), %eax - movapd %xmm0, (%eax) - addl $12, %esp - ret +We should inline lrintf and probably other libc functions. -icc generates: +//===---------------------------------------------------------------------===// - movl 4(%esp), %edx #3.6 - movl 8(%esp), %eax #3.6 - movapd (%eax), %xmm0 #4.22 - movlpd 12(%esp), %xmm0 #4.8 - movapd %xmm0, (%edx) #4.3 - ret #5.1 +Start using the flags more. 
For example, compile: -So icc is smart enough to know that B is in memory so it doesn't load it and -store it back to stack. +int add_zf(int *x, int y, int a, int b) { + if ((*x += y) == 0) + return a; + else + return b; +} -//===---------------------------------------------------------------------===// +to: + addl %esi, (%rdi) + movl %edx, %eax + cmovne %ecx, %eax + ret +instead of: + +_add_zf: + addl (%rdi), %esi + movl %esi, (%rdi) + testl %esi, %esi + cmove %edx, %ecx + movl %ecx, %eax + ret -__m128d test1( __m128d A, __m128d B) { - return _mm_shuffle_pd(A, B, 0x3); +and: + +int add_zf(int *x, int y, int a, int b) { + if ((*x + y) < 0) + return a; + else + return b; } -compiles to +to: + +add_zf: + addl (%rdi), %esi + movl %edx, %eax + cmovns %ecx, %eax + ret + +instead of: -shufpd $3, %xmm1, %xmm0 +_add_zf: + addl (%rdi), %esi + testl %esi, %esi + cmovs %edx, %ecx + movl %ecx, %eax + ret + +//===---------------------------------------------------------------------===// -Perhaps it's better to use unpckhpd instead? +This: +#include +int foo(double X) { return isnan(X); } -unpckhpd %xmm1, %xmm0 +compiles to (-m64): -Don't know if unpckhpd is faster. But it is shorter. +_foo: + pxor %xmm1, %xmm1 + ucomisd %xmm1, %xmm0 + setp %al + movzbl %al, %eax + ret + +the pxor is not needed, we could compare the value against itself. //===---------------------------------------------------------------------===// -If shorter, we should use things like: -movzwl %ax, %eax -instead of: -andl $65535, %EAX +These two functions have identical effects: -The former can also be used when the two-addressy nature of the 'and' would -require a copy to be inserted (in X86InstrInfo::convertToThreeAddress). +unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;} +unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;} + +We currently compile them to: + +_f: + movl 4(%esp), %eax + movl %eax, %ecx + incl %ecx + movl 8(%esp), %edx + cmpl %edx, %ecx + jne LBB1_2 #UnifiedReturnBlock +LBB1_1: #cond_true + addl $2, %eax + ret +LBB1_2: #UnifiedReturnBlock + movl %ecx, %eax + ret +_f2: + movl 4(%esp), %eax + movl %eax, %ecx + incl %ecx + cmpl 8(%esp), %ecx + sete %cl + movzbl %cl, %ecx + leal 1(%ecx,%eax), %eax + ret + +both of which are inferior to GCC's: + +_f: + movl 4(%esp), %edx + leal 1(%edx), %eax + addl $2, %edx + cmpl 8(%esp), %eax + cmove %edx, %eax + ret +_f2: + movl 4(%esp), %eax + addl $1, %eax + xorl %edx, %edx + cmpl 8(%esp), %eax + sete %dl + addl %edx, %eax + ret //===---------------------------------------------------------------------===// -This code generates ugly code, probably due to costs being off or something: +This code: -void %test(float* %P, <4 x float>* %P2 ) { - %xFloat0.688 = load float* %P - %loadVector37.712 = load <4 x float>* %P2 - %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3 - store <4 x float> %inFloat3.713, <4 x float>* %P2 - ret void +void test(int X) { + if (X) abort(); } -Generates: +is currently compiled to: _test: - pxor %xmm0, %xmm0 - movd %xmm0, %eax ;; EAX = 0! - movl 8(%esp), %ecx - movaps (%ecx), %xmm0 - pinsrw $6, %eax, %xmm0 - shrl $16, %eax ;; EAX = 0 again! 
- pinsrw $7, %eax, %xmm0 - movaps %xmm0, (%ecx) + subl $12, %esp + cmpl $0, 16(%esp) + jne LBB1_1 + addl $12, %esp ret +LBB1_1: + call L_abort$stub -It would be better to generate: +It would be better to produce: _test: - movl 8(%esp), %ecx - movaps (%ecx), %xmm0 - xor %eax, %eax - pinsrw $6, %eax, %xmm0 - pinsrw $7, %eax, %xmm0 - movaps %xmm0, (%ecx) + subl $12, %esp + cmpl $0, 16(%esp) + jne L_abort$stub + addl $12, %esp + ret + +This can be applied to any no-return function call that takes no arguments etc. +Alternatively, the stack save/restore logic could be shrink-wrapped, producing +something like this: + +_test: + cmpl $0, 4(%esp) + jne LBB1_1 + ret +LBB1_1: + subl $12, %esp + call L_abort$stub + +Both are useful in different situations. Finally, it could be shrink-wrapped +and tail called, like this: + +_test: + cmpl $0, 4(%esp) + jne LBB1_1 ret +LBB1_1: + pop %eax # realign stack. + call L_abort$stub -or use pxor (to make a zero vector) and shuffle (to insert it). +Though this probably isn't worth it. //===---------------------------------------------------------------------===// -Bad codegen: +We need to teach the codegen to convert two-address INC instructions to LEA +when the flags are dead. For example, on X86-64, compile: + +int foo(int A, int B) { + return A+1; +} -char foo(int x) { return x; } +to: _foo: - movl 4(%esp), %eax - shll $24, %eax - sarl $24, %eax - ret + leal 1(%edi), %eax + ret + +instead of: + +_foo: + incl %edi + movl %edi, %eax + ret + +Another example is: + +;; X's live range extends beyond the shift, so the register allocator +;; cannot coalesce it with Y. Because of this, a copy needs to be +;; emitted before the shift to save the register value before it is +;; clobbered. However, this copy is not needed if the register +;; allocator turns the shift into an LEA. This also occurs for ADD. + +; Check that the shift gets turned into an LEA. +; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \ +; RUN: not grep {mov E.X, E.X} + +%G = external global int + +int %test1(int %X, int %Y) { + %Z = add int %X, %Y + volatile store int %Y, int* %G + volatile store int %Z, int* %G + ret int %X +} + +int %test2(int %X) { + %Z = add int %X, 1 ;; inc + volatile store int %Z, int* %G + ret int %X +} + +//===---------------------------------------------------------------------===// + +This: +#include +unsigned test(float f) { + return _mm_cvtsi128_si32( (__m128i) _mm_set_ss( f )); +} + +Compiles to: +_test: + movss 4(%esp), %xmm0 + movd %xmm0, %eax + ret + +it should compile to a move from the stack slot directly into eax. DAGCombine +has this xform, but it is currently disabled until the alignment fields of +the load/store nodes are trustworthy. //===---------------------------------------------------------------------===// -Some useful information in the Apple Altivec / SSE Migration Guide: +Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with +a neg instead of a sub instruction. Consider: + +int test(char X) { return 7-X; } + +we currently produce: +_test: + movl $7, %eax + movsbl 4(%esp), %ecx + subl %ecx, %eax + ret + +We would use one fewer register if codegen'd as: + + movsbl 4(%esp), %eax + neg %eax + add $7, %eax + ret + +Note that this isn't beneficial if the load can be folded into the sub. 
In +this case, we want a sub: -http://developer.apple.com/documentation/Performance/Conceptual/ -Accelerate_sse_migration/index.html +int test(int X) { return 7-X; } +_test: + movl $7, %eax + subl 4(%esp), %eax + ret + +//===---------------------------------------------------------------------===// + +For code like: +phi (undef, x) + +We get an implicit def on the undef side. If the phi is spilled, we then get: +implicitdef xmm1 +store xmm1 -> stack + +It should be possible to teach the x86 backend to "fold" the store into the +implicitdef, which just deletes the implicit def. + +These instructions should go away: +#IMPLICIT_DEF %xmm1 +movaps %xmm1, 192(%esp) +movaps %xmm1, 224(%esp) +movaps %xmm1, 176(%esp) + +//===---------------------------------------------------------------------===// + +This is a "commutable two-address" register coallescing deficiency: + +define <4 x float> @test1(<4 x float> %V) { +entry: + %tmp8 = shufflevector <4 x float> %V, <4 x float> undef, <4 x i32> < i32 3, i32 2, i32 1, i32 0 > ; <<4 x float>> [#uses=1] + %add = add <4 x float> %tmp8, %V ; <<4 x float>> [#uses=1] + ret <4 x float> %add +} + +this codegens to: + +_test1: + pshufd $27, %xmm0, %xmm1 + addps %xmm0, %xmm1 + movaps %xmm1, %xmm0 + ret + +instead of: + +_test1: + pshufd $27, %xmm0, %xmm1 + addps %xmm1, %xmm0 + ret -e.g. SSE select using and, andnot, or. Various SSE compare translations.
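
The same deficiency written with intrinsics (a reduced sketch of the IR above;
selector 27 is _MM_SHUFFLE(0,1,2,3), i.e. a lane reversal): because addps is
commutable, the coalescer could swap its operands, define the sum directly in
%xmm0, and drop the trailing movaps.

#include <xmmintrin.h>

__m128 test1_c(__m128 V) {
  /* Reverse the four lanes, then add the original value. */
  __m128 rev = _mm_shuffle_ps(V, V, _MM_SHUFFLE(0, 1, 2, 3));
  return _mm_add_ps(rev, V);
}
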