X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FREADME.txt;h=4464878ce2173c49c827b5a04cb9314243d461b3;hb=b9e126ce7de05d580d9eb3b664487dab07304939;hp=759c7acf389173e3d52de301f3720cb1855c3751;hpb=63079f0757785c5c461bafdd3101ee40aeb717fe;p=oota-llvm.git diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index 759c7acf389..4464878ce21 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -2,11 +2,8 @@ // Random ideas for the X86 backend. //===---------------------------------------------------------------------===// -Missing features: - - Support for SSE4: http://www.intel.com/software/penryn -http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf - - support for 3DNow! - - weird abis? +We should add support for the "movbe" instruction, which does a byte-swapping +copy (3-addr bswap + memory support?) This is available on Atom processors. //===---------------------------------------------------------------------===// @@ -54,6 +51,17 @@ One better solution for 1LL << x is: But that requires good 8-bit subreg support. +Also, this might be better. It's an extra shift, but it's one instruction +shorter, and doesn't stress 8-bit subreg support. +(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html, +but without the unnecessary and.) + movl %ecx, %eax + shrl $5, %eax + movl %eax, %edx + xorl $1, %edx + sall %cl, %eax + sall %cl, %edx + 64-bit shifts (in general) expand to really bad code. Instead of using cmovs, we should expand to a conditional branch like GCC produces. @@ -67,6 +75,9 @@ into: xorl $1, %eax ret +(Although note that this isn't a legal way to express the code that llvm-gcc +currently generates for that function.) + //===---------------------------------------------------------------------===// Some isel ideas: @@ -94,34 +105,6 @@ the coalescer how to deal with it though.
//===---------------------------------------------------------------------===// -Count leading zeros and count trailing zeros: - -int clz(int X) { return __builtin_clz(X); } -int ctz(int X) { return __builtin_ctz(X); } - -$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel -clz: - bsr %eax, DWORD PTR [%esp+4] - xor %eax, 31 - ret -ctz: - bsf %eax, DWORD PTR [%esp+4] - ret - -however, check that these are defined for 0 and 32. Our intrinsics are, GCC's -aren't. - -Another example (use predsimplify to eliminate a select): - -int foo (unsigned long j) { - if (j) - return __builtin_ffs (j) - 1; - else - return 0; -} - -//===---------------------------------------------------------------------===// - It appears icc use push for parameter passing. Need to investigate. //===---------------------------------------------------------------------===// @@ -208,9 +191,9 @@ when we can spare a register. It reduces code size. Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently get this: -int %test1(int %X) { - %Y = div int %X, 8 - ret int %Y +define i32 @test1(i32 %X) { + %Y = sdiv i32 %X, 8 + ret i32 %Y } _test1: @@ -236,32 +219,6 @@ which is probably slower, but it's interesting at least :) //===---------------------------------------------------------------------===// -The first BB of this code: - -declare bool %foo() -int %bar() { - %V = call bool %foo() - br bool %V, label %T, label %F -T: - ret int 1 -F: - call bool %foo() - ret int 12 -} - -compiles to: - -_bar: - subl $12, %esp - call L_foo$stub - xorb $1, %al - testb %al, %al - jne LBB_bar_2 # F - -It would be better to emit "cmp %al, 1" than a xor and test. 
- -//===---------------------------------------------------------------------===// - We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl We should leave these as libcalls for everything over a much lower threshold, since libc is hand tuned for medium and large mem ops (avoiding RFO for large @@ -278,113 +235,17 @@ Optimize copysign(x, *y) to use an integer load from y. //===---------------------------------------------------------------------===// -%X = weak global int 0 - -void %foo(int %N) { - %N = cast int %N to uint - %tmp.24 = setgt int %N, 0 - br bool %tmp.24, label %no_exit, label %return - -no_exit: - %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ] - %i.0.0 = cast uint %indvar to int - volatile store int %i.0.0, int* %X - %indvar.next = add uint %indvar, 1 - %exitcond = seteq uint %indvar.next, %N - br bool %exitcond, label %return, label %no_exit - -return: - ret void -} - -compiles into: - - .text - .align 4 - .globl _foo -_foo: - movl 4(%esp), %eax - cmpl $1, %eax - jl LBB_foo_4 # return -LBB_foo_1: # no_exit.preheader - xorl %ecx, %ecx -LBB_foo_2: # no_exit - movl L_X$non_lazy_ptr, %edx - movl %ecx, (%edx) - incl %ecx - cmpl %eax, %ecx - jne LBB_foo_2 # no_exit -LBB_foo_3: # return.loopexit -LBB_foo_4: # return - ret - -We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after -remateralization is implemented. This can be accomplished with 1) a target -dependent LICM pass or 2) makeing SelectDAG represent the whole function. - -//===---------------------------------------------------------------------===// - The following tests perform worse with LSR: lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor. 
//===---------------------------------------------------------------------===// -We are generating far worse code than gcc: - -volatile short X, Y; - -void foo(int N) { - int i; - for (i = 0; i < N; i++) { X = i; Y = i*4; } -} - -LBB1_1: # entry.bb_crit_edge - xorl %ecx, %ecx - xorw %dx, %dx -LBB1_2: # bb - movl L_X$non_lazy_ptr, %esi - movw %cx, (%esi) - movl L_Y$non_lazy_ptr, %esi - movw %dx, (%esi) - addw $4, %dx - incl %ecx - cmpl %eax, %ecx - jne LBB1_2 # bb - -vs. - - xorl %edx, %edx - movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi - movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx -L4: - movw %dx, (%esi) - leal 0(,%edx,4), %eax - movw %ax, (%ecx) - addl $1, %edx - cmpl %edx, %edi - jne L4 - -This is due to the lack of post regalloc LICM. - -//===---------------------------------------------------------------------===// - Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 / FR64 to VR128. //===---------------------------------------------------------------------===// -mov $reg, 48(%esp) -... -leal 48(%esp), %eax -mov %eax, (%esp) -call _foo - -Obviously it would have been better for the first mov (or any op) to store -directly %esp[0] if there are no other uses. - -//===---------------------------------------------------------------------===// - Adding to the list of cmp / test poor codegen issues: int test(__m128 *A, __m128 *B) { @@ -425,75 +286,6 @@ There is also one case we do worse on PPC. //===---------------------------------------------------------------------===// -If shorter, we should use things like: -movzwl %ax, %eax -instead of: -andl $65535, %EAX - -The former can also be used when the two-addressy nature of the 'and' would -require a copy to be inserted (in X86InstrInfo::convertToThreeAddress). 
- -//===---------------------------------------------------------------------===// - -Consider this: - -typedef struct pair { float A, B; } pair; -void pairtest(pair P, float *FP) { - *FP = P.A+P.B; -} - -We currently generate this code with llvmgcc4: - -_pairtest: - movl 8(%esp), %eax - movl 4(%esp), %ecx - movd %eax, %xmm0 - movd %ecx, %xmm1 - addss %xmm0, %xmm1 - movl 12(%esp), %eax - movss %xmm1, (%eax) - ret - -we should be able to generate: -_pairtest: - movss 4(%esp), %xmm0 - movl 12(%esp), %eax - addss 8(%esp), %xmm0 - movss %xmm0, (%eax) - ret - -The issue is that llvmgcc4 is forcing the struct to memory, then passing it as -integer chunks. It does this so that structs like {short,short} are passed in -a single 32-bit integer stack slot. We should handle the safe cases above much -nicer, while still handling the hard cases. - -While true in general, in this specific case we could do better by promoting -load int + bitcast to float -> load fload. This basically needs alignment info, -the code is already implemented (but disabled) in dag combine). - -//===---------------------------------------------------------------------===// - -Another instruction selector deficiency: - -void %bar() { - %tmp = load int (int)** %foo - %tmp = tail call int %tmp( int 3 ) - ret void -} - -_bar: - subl $12, %esp - movl L_foo$non_lazy_ptr, %eax - movl (%eax), %eax - call *%eax - addl $12, %esp - ret - -The current isel scheme will not allow the load to be folded in the call since -the load's chain result is read by the callseq_start. - -//===---------------------------------------------------------------------===// - For this: int test(int a) @@ -519,21 +311,30 @@ estimate to determine whether the match is profitable. However, if we care more about code size, then imull is better. It's two bytes shorter than movl + leal. 
+On a Pentium M, both variants have the same characteristics with regard +to throughput; however, the multiplication has a latency of four cycles, as +opposed to two cycles for the movl+lea variant. + //===---------------------------------------------------------------------===// -Implement CTTZ, CTLZ with bsf and bsr. GCC produces: +__builtin_ffs codegen is messy. -int ctz_(unsigned X) { return __builtin_ctz(X); } -int clz_(unsigned X) { return __builtin_clz(X); } int ffs_(unsigned X) { return __builtin_ffs(X); } -_ctz_: - bsfl 4(%esp), %eax - ret -_clz_: - bsrl 4(%esp), %eax - xorl $31, %eax +llvm produces: +ffs_: + movl 4(%esp), %ecx + bsfl %ecx, %eax + movl $32, %edx + cmove %edx, %eax + incl %eax + xorl %edx, %edx + testl %ecx, %ecx + cmove %edx, %eax ret + +vs gcc: + _ffs_: movl $-1, %edx bsfl 4(%esp), %eax @@ -541,6 +342,15 @@ _ffs_: addl $1, %eax ret +Another example of __builtin_ffs (use predsimplify to eliminate a select): + +int foo (unsigned long j) { + if (j) + return __builtin_ffs (j) - 1; + else + return 0; +} + //===---------------------------------------------------------------------===// It appears gcc place string data with linkonce linkage in @@ -551,25 +361,24 @@ do not make use of. 
//===---------------------------------------------------------------------===// -int %foo(int* %a, int %t) { +define i32 @foo(i32* %a, i32 %t) { entry: - br label %cond_true - -cond_true: ; preds = %cond_true, %entry - %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ] - %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ] - %tmp2 = getelementptr int* %a, int %x.0.0 - %tmp3 = load int* %tmp2 ; [#uses=1] - %tmp5 = add int %t_addr.0.0, %x.0.0 ; [#uses=1] - %tmp7 = add int %tmp5, %tmp3 ; [#uses=2] - %tmp9 = add int %x.0.0, 1 ; [#uses=2] - %tmp = setgt int %tmp9, 39 ; [#uses=1] - br bool %tmp, label %bb12, label %cond_true - -bb12: ; preds = %cond_true - ret int %tmp7 + br label %cond_true + +cond_true: ; preds = %cond_true, %entry + %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; [#uses=3] + %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; [#uses=1] + %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; [#uses=1] + %tmp3 = load i32* %tmp2 ; [#uses=1] + %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; [#uses=1] + %tmp7 = add i32 %tmp5, %tmp3 ; [#uses=2] + %tmp9 = add i32 %x.0.0, 1 ; [#uses=2] + %tmp = icmp sgt i32 %tmp9, 39 ; [#uses=1] + br i1 %tmp, label %bb12, label %cond_true + +bb12: ; preds = %cond_true + ret i32 %tmp7 } - is pessimized by -loop-reduce and -indvars //===---------------------------------------------------------------------===// @@ -673,40 +482,11 @@ _usesbb: //===---------------------------------------------------------------------===// -Currently we don't have elimination of redundant stack manipulations. Consider -the code: - -int %main() { -entry: - call fastcc void %test1( ) - call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) ) - ret int 0 -} - -declare fastcc void %test1() - -declare fastcc void %test2(sbyte*) - - -This currently compiles to: - - subl $16, %esp - call _test5 - addl $12, %esp - subl $16, %esp - movl $_test5, (%esp) - call _test6 - addl $12, %esp - -The add\sub pair is really unneeded here. 
- -//===---------------------------------------------------------------------===// - Consider the expansion of: -uint %test3(uint %X) { - %tmp1 = rem uint %X, 255 - ret uint %tmp1 +define i32 @test3(i32 %X) { + %tmp1 = urem i32 %X, 255 + ret i32 %tmp1 } Currently it compiles to: @@ -740,9 +520,9 @@ imagine there has to be some kind of complicated decoder reset and realignment to grab the bytes from the next cacheline. 532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines -942 942 0x3d03 movl %dh, (1809(%esp, %esi) -937 937 0x3d0a incl %esi -3 3 0x3d0b cmpb %bl, %dl +942 942 0x3d03 movl %dh, (1809(%esp, %esi) +937 937 0x3d0a incl %esi +3 3 0x3d0b cmpb %bl, %dl 27 27 0x3d0d jnz 0x000062db //===---------------------------------------------------------------------===// @@ -948,22 +728,22 @@ Another example is: ;; allocator turns the shift into an LEA. This also occurs for ADD. ; Check that the shift gets turned into an LEA. -; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \ +; RUN: llvm-as < %s | llc -march=x86 -x86-asm-syntax=intel | \ ; RUN: not grep {mov E.X, E.X} -%G = external global int +@G = external global i32 ; [#uses=3] -int %test1(int %X, int %Y) { - %Z = add int %X, %Y - volatile store int %Y, int* %G - volatile store int %Z, int* %G - ret int %X +define i32 @test1(i32 %X, i32 %Y) { + %Z = add i32 %X, %Y ; [#uses=1] + volatile store i32 %Y, i32* @G + volatile store i32 %Z, i32* @G + ret i32 %X } -int %test2(int %X) { - %Z = add int %X, 1 ;; inc - volatile store int %Z, int* %G - ret int %X +define i32 @test2(i32 %X) { + %Z = add i32 %X, 1 ; [#uses=1] + volatile store i32 %Z, i32* @G + ret i32 %X } //===---------------------------------------------------------------------===// @@ -998,51 +778,6 @@ _test: //===---------------------------------------------------------------------===// -For code like: -phi (undef, x) - -We get an implicit def on the undef side. 
If the phi is spilled, we then get: -implicitdef xmm1 -store xmm1 -> stack - -It should be possible to teach the x86 backend to "fold" the store into the -implicitdef, which just deletes the implicit def. - -These instructions should go away: -#IMPLICIT_DEF %xmm1 -movaps %xmm1, 192(%esp) -movaps %xmm1, 224(%esp) -movaps %xmm1, 176(%esp) - -//===---------------------------------------------------------------------===// - -This is a "commutable two-address" register coallescing deficiency: - -define <4 x float> @test1(<4 x float> %V) { -entry: - %tmp8 = shufflevector <4 x float> %V, <4 x float> undef, - <4 x i32> < i32 3, i32 2, i32 1, i32 0 > - %add = add <4 x float> %tmp8, %V - ret <4 x float> %add -} - -this codegens to: - -_test1: - pshufd $27, %xmm0, %xmm1 - addps %xmm0, %xmm1 - movaps %xmm1, %xmm0 - ret - -instead of: - -_test1: - pshufd $27, %xmm0, %xmm1 - addps %xmm1, %xmm0 - ret - -//===---------------------------------------------------------------------===// - Leaf functions that require one 4-byte spill slot have a prolog like this: _foo: @@ -1119,6 +854,8 @@ Should compile to: setae %al ret +FIXME: That code looks wrong; bool return is normally defined as zext. + on x86-64, not: __Z11no_overflowjj: @@ -1136,34 +873,6 @@ condition register is dead. xor reg reg is shorter than mov reg, #0. //===---------------------------------------------------------------------===// -We aren't matching RMW instructions aggressively -enough. 
Here's a reduced testcase (more in PR1160): - -define void @test(i32* %huge_ptr, i32* %target_ptr) { - %A = load i32* %huge_ptr ; [#uses=1] - %B = load i32* %target_ptr ; [#uses=1] - %C = or i32 %A, %B ; [#uses=1] - store i32 %C, i32* %target_ptr - ret void -} - -$ llvm-as < t.ll | llc -march=x86-64 - -_test: - movl (%rdi), %eax - orl (%rsi), %eax - movl %eax, (%rsi) - ret - -That should be something like: - -_test: - movl (%rdi), %eax - orl %eax, (%rsi) - ret - -//===---------------------------------------------------------------------===// - The following code: bb114.preheader: ; preds = %cond_next94 @@ -1216,30 +925,6 @@ vice-versa). //===---------------------------------------------------------------------===// -For this code: - -cond_next603: ; preds = %bb493, %cond_true336, %cond_next599 - %v.21050.1 = phi i32 [ %v.21050.0, %cond_next599 ], [ %tmp344, %cond_true336 ], [ %v.2, %bb493 ] ; [#uses=1] - %maxz.21051.1 = phi i32 [ %maxz.21051.0, %cond_next599 ], [ 0, %cond_true336 ], [ %maxz.2, %bb493 ] ; [#uses=2] - %cnt.01055.1 = phi i32 [ %cnt.01055.0, %cond_next599 ], [ 0, %cond_true336 ], [ %cnt.0, %bb493 ] ; [#uses=2] - %byteptr.9 = phi i8* [ %byteptr.12, %cond_next599 ], [ %byteptr.0, %cond_true336 ], [ %byteptr.10, %bb493 ] ; [#uses=9] - %bitptr.6 = phi i32 [ %tmp5571104.1, %cond_next599 ], [ %tmp4921049, %cond_true336 ], [ %bitptr.7, %bb493 ] ; [#uses=4] - %source.5 = phi i32 [ %tmp602, %cond_next599 ], [ %source.0, %cond_true336 ], [ %source.6, %bb493 ] ; [#uses=7] - %tmp606 = getelementptr %struct.const_tables* @tables, i32 0, i32 0, i32 %cnt.01055.1 ; [#uses=1] - %tmp607 = load i8* %tmp606, align 1 ; [#uses=1] - -We produce this: - -LBB4_70: # cond_next603 - movl -20(%ebp), %esi - movl L_tables$non_lazy_ptr-"L4$pb"(%esi), %esi - -However, ICC caches this information before the loop and produces this: - - movl 88(%esp), %eax #481.12 - -//===---------------------------------------------------------------------===// - This code: %tmp659 = icmp slt i16 
%tmp654, 0 ; [#uses=1] @@ -1256,37 +941,6 @@ suggests using the 32-bit register (which is what ICC uses). //===---------------------------------------------------------------------===// -rdar://5506677 - We compile this: - -define i32 @foo(double %x) { - %x14 = bitcast double %x to i64 ; [#uses=1] - %tmp713 = trunc i64 %x14 to i32 ; [#uses=1] - %tmp8 = and i32 %tmp713, 2147483647 ; [#uses=1] - ret i32 %tmp8 -} - -to: - -_foo: - subl $12, %esp - fldl 16(%esp) - fstpl (%esp) - movl $2147483647, %eax - andl (%esp), %eax - addl $12, %esp - #FP_REG_KILL - ret - -It would be much better to eliminate the fldl/fstpl by folding the bitcast -into the load SDNode. That would give us: - -_foo: - movl $2147483647, %eax - andl 4(%esp), %eax - ret - -//===---------------------------------------------------------------------===// - We compile this: void compare (long long foo) { @@ -1296,44 +950,54 @@ void compare (long long foo) { to: -_compare: - subl $12, %esp - cmpl $0, 16(%esp) +compare: + subl $4, %esp + cmpl $0, 8(%esp) setne %al movzbw %al, %ax - cmpl $1, 20(%esp) + cmpl $1, 12(%esp) setg %cl movzbw %cl, %cx cmove %ax, %cx - movw %cx, %ax - testb $1, %al - je LBB1_2 # cond_true + testb $1, %cl + jne .LBB1_2 # UnifiedReturnBlock +.LBB1_1: # ifthen + call abort +.LBB1_2: # UnifiedReturnBlock + addl $4, %esp + ret (also really horrible code on ppc). This is due to the expand code for 64-bit compares. 
GCC produces multiple branches, which is much nicer: -_compare: - pushl %ebp - movl %esp, %ebp - subl $8, %esp - movl 8(%ebp), %eax - movl 12(%ebp), %edx - subl $1, %edx - jg L5 -L7: - jl L4 +compare: + subl $12, %esp + movl 20(%esp), %edx + movl 16(%esp), %eax + decl %edx + jle .L7 +.L5: + addl $12, %esp + ret + .p2align 4,,7 +.L7: + jl .L4 cmpl $0, %eax - jbe L4 -L5: + .p2align 4,,8 + ja .L5 +.L4: + .p2align 4,,9 + call abort //===---------------------------------------------------------------------===// Tail call optimization improvements: Tail call optimization currently pushes all arguments on the top of the stack (their normal place for -non-tail call optimized calls) before moving them to actual stack -slot. This is done to prevent overwriting of parameters (see example -below) that might be used, since the arguments of the callee -overwrites caller's arguments. +non-tail call optimized calls) that source from the caller's arguments +or that source from a virtual register (also possibly sourcing from +caller's arguments). +This is done to prevent overwriting of parameters (see example +below) that might be used later. example: @@ -1352,13 +1016,6 @@ arg2 of the caller. Possible optimizations: - - Only push those arguments to the top of the stack that are actual - parameters of the caller function and have no local value in the - caller. - - In the above example local does not need to be pushed onto the top - of the stack as it is definitely not a caller's function - parameter. - Analyse the actual parameters of the callee to see which would overwrite a caller parameter which is used by the callee and only @@ -1380,35 +1037,6 @@ Possible optimizations: Here we need to push the arguments because they overwrite each other.
- - Code for lowering directly onto callers arguments: -+ SmallVector, 8> RegsToPass; -+ SmallVector MemOpChains; -+ -+ SDOperand FramePtr; -+ SDOperand PtrOff; -+ SDOperand FIN; -+ int FI = 0; -+ // Walk the register/memloc assignments, inserting copies/loads. -+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { -+ CCValAssign &VA = ArgLocs[i]; -+ SDOperand Arg = Op.getOperand(5+2*VA.getValNo()); -+ -+ .... -+ -+ if (VA.isRegLoc()) { -+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); -+ } else { -+ assert(VA.isMemLoc()); -+ // create frame index -+ int32_t Offset = VA.getLocMemOffset()+FPDiff; -+ uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8; -+ FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset); -+ FIN = DAG.getFrameIndex(FI, MVT::i32); -+ // store relative to framepointer -+ MemOpChains.push_back(DAG.getStore(Chain, Arg, FIN, NULL, 0)); -+ } -+ } //===---------------------------------------------------------------------===// main () @@ -1503,7 +1131,7 @@ Should compile into: _foo: movzwl 4(%esp), %eax - orb $-1, %al ;; 'orl 255' is also fine :) + orl $255, %eax ret instead of: @@ -1515,23 +1143,44 @@ _foo: //===---------------------------------------------------------------------===// -We're missing an obvious fold of a load into imul: +We're codegen'ing multiply of long longs inefficiently: -int test(long a, long b) { return a * b; } +unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) { + return arg1 * arg2; +} -LLVM produces: -_test: - movl 4(%esp), %ecx - movl 8(%esp), %eax - imull %ecx, %eax - ret +We compile to (fomit-frame-pointer): -vs: -_test: - movl 8(%esp), %eax - imull 4(%esp), %eax +_LLM: + pushl %esi + movl 8(%esp), %ecx + movl 16(%esp), %esi + movl %esi, %eax + mull %ecx + imull 12(%esp), %esi + addl %edx, %esi + imull 20(%esp), %ecx + movl %esi, %edx + addl %ecx, %edx + popl %esi + ret + +This looks like a scheduling deficiency and lack of remat of the load from +the argument area. 
ICC apparently produces: + + movl 8(%esp), %ecx + imull 12(%esp), %ecx + movl 16(%esp), %eax + imull 4(%esp), %eax + addl %eax, %ecx + movl 4(%esp), %eax + mull 12(%esp) + addl %ecx, %edx ret +Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR: +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236 + //===---------------------------------------------------------------------===// We can fold a store into "zeroing a reg". Instead of: @@ -1545,6 +1194,9 @@ movl $0, 124(%esp) if the flags of the xor are dead. +Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should +be folded into: shl [mem], 1 + //===---------------------------------------------------------------------===// This testcase misses a read/modify/write opportunity (from PR1425): @@ -1597,3 +1249,686 @@ a stride-4 IV, would would allow all the scales in the loop to go away. This would result in smaller code and more efficient microops. //===---------------------------------------------------------------------===// + +In SSE mode, we turn abs and neg into a load from the constant pool plus a xor +or and instruction, for example: + + xorpd LCPI1_0, %xmm2 + +However, if xmm2 gets spilled, we end up with really ugly code like this: + + movsd (%esp), %xmm0 + xorpd LCPI1_0, %xmm0 + movsd %xmm0, (%esp) + +Since we 'know' that this is a 'neg', we can actually "fold" the spill into +the neg/abs instruction, turning it into an *integer* operation, like this: + + xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31) + +you could also use xorb, but xorl is less likely to lead to a partial register +stall. 
Here is a contrived testcase: + +double a, b, c; +void test(double *P) { + double X = *P; + a = X; + bar(); + X = -X; + b = X; + bar(); + c = X; +} + +//===---------------------------------------------------------------------===// + +handling llvm.memory.barrier on pre SSE2 cpus + +should generate: +lock ; mov %esp, %esp + +//===---------------------------------------------------------------------===// + +The generated code on x86 for checking for signed overflow on a multiply the +obvious way is much longer than it needs to be. + +int x(int a, int b) { + long long prod = (long long)a*b; + return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1); +} + +See PR2053 for more details. + +//===---------------------------------------------------------------------===// + +We should investigate using cdq/cltd (effect: edx = sar eax, 31) +more aggressively; it should cost the same as a move+shift on any modern +processor, but it's a lot shorter. Downside is that it puts more +pressure on register allocation because it has fixed operands. + +Example: +int abs(int x) {return x < 0 ? -x : x;} + +gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.: +abs: + movl 4(%esp), %eax + cltd + xorl %edx, %eax + subl %edx, %eax + ret + +//===---------------------------------------------------------------------===// + +Consider: +int test(unsigned long a, unsigned long b) { return -(a < b); } + +We currently compile this to: + +define i32 @test(i32 %a, i32 %b) nounwind { + %tmp3 = icmp ult i32 %a, %b ; [#uses=1] + %tmp34 = zext i1 %tmp3 to i32 ; [#uses=1] + %tmp5 = sub i32 0, %tmp34 ; [#uses=1] + ret i32 %tmp5 +} + +and + +_test: + movl 8(%esp), %eax + cmpl %eax, 4(%esp) + setb %al + movzbl %al, %eax + negl %eax + ret + +Several deficiencies here.
First, we should instcombine zext+neg into sext: + +define i32 @test2(i32 %a, i32 %b) nounwind { + %tmp3 = icmp ult i32 %a, %b ; [#uses=1] + %tmp34 = sext i1 %tmp3 to i32 ; [#uses=1] + ret i32 %tmp34 +} + +However, before we can do that, we have to fix the bad codegen that we get for +sext from bool: + +_test2: + movl 8(%esp), %eax + cmpl %eax, 4(%esp) + setb %al + movzbl %al, %eax + shll $31, %eax + sarl $31, %eax + ret + +This code should be at least as good as the code above. Once this is fixed, we +can optimize this specific case even more to: + + movl 8(%esp), %eax + xorl %ecx, %ecx + cmpl %eax, 4(%esp) + sbbl %ecx, %ecx + +//===---------------------------------------------------------------------===// + +Take the following code (from +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541): + +extern unsigned char first_one[65536]; +int FirstOnet(unsigned long long arg1) +{ + if (arg1 >> 48) + return (first_one[arg1 >> 48]); + return 0; +} + + +The following code is currently generated: +FirstOnet: + movl 8(%esp), %eax + cmpl $65536, %eax + movl 4(%esp), %ecx + jb .LBB1_2 # UnifiedReturnBlock +.LBB1_1: # ifthen + shrl $16, %eax + movzbl first_one(%eax), %eax + ret +.LBB1_2: # UnifiedReturnBlock + xorl %eax, %eax + ret + +There are a few possible improvements here: +1. We should be able to eliminate the dead load into %ecx +2. We could change the "movl 8(%esp), %eax" into + "movzwl 10(%esp), %eax"; this lets us change the cmpl + into a testl, which is shorter, and eliminate the shift. + +We could also in theory eliminate the branch by using a conditional +for the address of the load, but that seems unlikely to be worthwhile +in general. 
+ +//===---------------------------------------------------------------------===// + +We compile this function: + +define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind { +entry: + %tmp2 = icmp eq i8 %d, 0 ; [#uses=1] + br i1 %tmp2, label %bb7, label %bb + +bb: ; preds = %entry + %tmp6 = add i32 %b, %a ; [#uses=1] + ret i32 %tmp6 + +bb7: ; preds = %entry + %tmp10 = sub i32 %a, %c ; [#uses=1] + ret i32 %tmp10 +} + +to: + +_foo: + cmpb $0, 16(%esp) + movl 12(%esp), %ecx + movl 8(%esp), %eax + movl 4(%esp), %edx + je LBB1_2 # bb7 +LBB1_1: # bb + addl %edx, %eax + ret +LBB1_2: # bb7 + movl %edx, %eax + subl %ecx, %eax + ret + +The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2 +if it commuted the addl in LBB1_1. + +//===---------------------------------------------------------------------===// + +See rdar://4653682. + +From flops: + +LBB1_15: # bb310 + cvtss2sd LCPI1_0, %xmm1 + addsd %xmm1, %xmm0 + movsd 176(%esp), %xmm2 + mulsd %xmm0, %xmm2 + movapd %xmm2, %xmm3 + mulsd %xmm3, %xmm3 + movapd %xmm3, %xmm4 + mulsd LCPI1_23, %xmm4 + addsd LCPI1_24, %xmm4 + mulsd %xmm3, %xmm4 + addsd LCPI1_25, %xmm4 + mulsd %xmm3, %xmm4 + addsd LCPI1_26, %xmm4 + mulsd %xmm3, %xmm4 + addsd LCPI1_27, %xmm4 + mulsd %xmm3, %xmm4 + addsd LCPI1_28, %xmm4 + mulsd %xmm3, %xmm4 + addsd %xmm1, %xmm4 + mulsd %xmm2, %xmm4 + movsd 152(%esp), %xmm1 + addsd %xmm4, %xmm1 + movsd %xmm1, 152(%esp) + incl %eax + cmpl %eax, %esi + jge LBB1_15 # bb310 +LBB1_16: # bb358.loopexit + movsd 152(%esp), %xmm0 + addsd %xmm0, %xmm0 + addsd LCPI1_22, %xmm0 + movsd %xmm0, 152(%esp) + +Rather than spilling the result of the last addsd in the loop, we should have +inserted a copy to split the interval (one for the duration of the loop, one +extending to the fall through). The register pressure in the loop isn't high +enough to warrant the spill. + +Also check why xmm7 is not used at all in the function.
+ +//===---------------------------------------------------------------------===// + +Legalize loses track of the fact that bools are always zero extended when in +memory. This causes us to compile abort_gzip (from 164.gzip) from: + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" +target triple = "i386-apple-darwin8" +@in_exit.4870.b = internal global i1 false ; [#uses=2] +define fastcc void @abort_gzip() noreturn nounwind { +entry: + %tmp.b.i = load i1* @in_exit.4870.b ; [#uses=1] + br i1 %tmp.b.i, label %bb.i, label %bb4.i +bb.i: ; preds = %entry + tail call void @exit( i32 1 ) noreturn nounwind + unreachable +bb4.i: ; preds = %entry + store i1 true, i1* @in_exit.4870.b + tail call void @exit( i32 1 ) noreturn nounwind + unreachable +} +declare void @exit(i32) noreturn nounwind + +into: + +_abort_gzip: + subl $12, %esp + movb _in_exit.4870.b, %al + notb %al + testb $1, %al + jne LBB1_2 ## bb4.i +LBB1_1: ## bb.i + ... + +//===---------------------------------------------------------------------===// + +We compile: + +int test(int x, int y) { + return x-y-1; +} + +into (-m64): + +_test: + decl %edi + movl %edi, %eax + subl %esi, %eax + ret + +it would be better to codegen as: x+~y (notl+addl) + +//===---------------------------------------------------------------------===// + +This code: + +int foo(const char *str,...) 
+{ + __builtin_va_list a; int x; + __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a); + return x; +} + +gets compiled into this on x86-64: + subq $200, %rsp + movaps %xmm7, 160(%rsp) + movaps %xmm6, 144(%rsp) + movaps %xmm5, 128(%rsp) + movaps %xmm4, 112(%rsp) + movaps %xmm3, 96(%rsp) + movaps %xmm2, 80(%rsp) + movaps %xmm1, 64(%rsp) + movaps %xmm0, 48(%rsp) + movq %r9, 40(%rsp) + movq %r8, 32(%rsp) + movq %rcx, 24(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 8(%rsp) + leaq (%rsp), %rax + movq %rax, 192(%rsp) + leaq 208(%rsp), %rax + movq %rax, 184(%rsp) + movl $48, 180(%rsp) + movl $8, 176(%rsp) + movl 176(%rsp), %eax + cmpl $47, %eax + jbe .LBB1_3 # bb +.LBB1_1: # bb3 + movq 184(%rsp), %rcx + leaq 8(%rcx), %rax + movq %rax, 184(%rsp) +.LBB1_2: # bb4 + movl (%rcx), %eax + addq $200, %rsp + ret +.LBB1_3: # bb + movl %eax, %ecx + addl $8, %eax + addq 192(%rsp), %rcx + movl %eax, 176(%rsp) + jmp .LBB1_2 # bb4 + +gcc 4.3 generates: + subq $96, %rsp +.LCFI0: + leaq 104(%rsp), %rax + movq %rsi, -80(%rsp) + movl $8, -120(%rsp) + movq %rax, -112(%rsp) + leaq -88(%rsp), %rax + movq %rax, -104(%rsp) + movl $8, %eax + cmpl $48, %eax + jb .L6 + movq -112(%rsp), %rdx + movl (%rdx), %eax + addq $96, %rsp + ret + .p2align 4,,10 + .p2align 3 +.L6: + mov %eax, %edx + addq -104(%rsp), %rdx + addl $8, %eax + movl %eax, -120(%rsp) + movl (%rdx), %eax + addq $96, %rsp + ret + +and it gets compiled into this on x86: + pushl %ebp + movl %esp, %ebp + subl $4, %esp + leal 12(%ebp), %eax + movl %eax, -4(%ebp) + leal 16(%ebp), %eax + movl %eax, -4(%ebp) + movl 12(%ebp), %eax + addl $4, %esp + popl %ebp + ret + +gcc 4.3 generates: + pushl %ebp + movl %esp, %ebp + movl 12(%ebp), %eax + popl %ebp + ret + +//===---------------------------------------------------------------------===// + +Teach tblgen not to check bitconvert source type in some cases. 
This allows us +to consolidate the following patterns in X86InstrMMX.td: + +def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), + (iPTR 0))))), + (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>; +def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), + (iPTR 0))))), + (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>; +def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), + (iPTR 0))))), + (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>; + +There are other cases in various td files. + +//===---------------------------------------------------------------------===// + +Take something like the following on x86-32: +unsigned a(unsigned long long x, unsigned y) {return x % y;} + +We currently generate a libcall, but we really shouldn't: the expansion is +shorter and likely faster than the libcall. The expected code is something +like the following: + + movl 12(%ebp), %eax + movl 16(%ebp), %ecx + xorl %edx, %edx + divl %ecx + movl 8(%ebp), %eax + divl %ecx + movl %edx, %eax + ret + +A similar code sequence works for division. + +//===---------------------------------------------------------------------===// + +These should compile to the same code, but the latter codegens to useless +instructions on X86. 
This may be a trivial dag combine (GCC PR7061): + +struct s1 { unsigned char a, b; }; +unsigned long f1(struct s1 x) { + return x.a + x.b; +} +struct s2 { unsigned a: 8, b: 8; }; +unsigned long f2(struct s2 x) { + return x.a + x.b; +} + +//===---------------------------------------------------------------------===// + +We currently compile this: + +define i32 @func1(i32 %v1, i32 %v2) nounwind { +entry: + %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) + %sum = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + br i1 %obit, label %overflow, label %normal +normal: + ret i32 %sum +overflow: + call void @llvm.trap() + unreachable +} +declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) +declare void @llvm.trap() + +to: + +_func1: + movl 4(%esp), %eax + addl 8(%esp), %eax + jo LBB1_2 ## overflow +LBB1_1: ## normal + ret +LBB1_2: ## overflow + ud2 + +it would be nice to produce "into" someday. + +//===---------------------------------------------------------------------===// + +This code: + +void vec_mpys1(int y[], const int x[], int scaler) { +int i; +for (i = 0; i < 150; i++) + y[i] += (((long long)scaler * (long long)x[i]) >> 31); +} + +Compiles to this loop with GCC 3.x: + +.L5: + movl %ebx, %eax + imull (%edi,%ecx,4) + shrdl $31, %edx, %eax + addl %eax, (%esi,%ecx,4) + incl %ecx + cmpl $149, %ecx + jle .L5 + +llvm-gcc compiles it to the much uglier: + +LBB1_1: ## bb1 + movl 24(%esp), %eax + movl (%eax,%edi,4), %ebx + movl %ebx, %ebp + imull %esi, %ebp + movl %ebx, %eax + mull %ecx + addl %ebp, %edx + sarl $31, %ebx + imull %ecx, %ebx + addl %edx, %ebx + shldl $1, %eax, %ebx + movl 20(%esp), %eax + addl %ebx, (%eax,%edi,4) + incl %edi + cmpl $150, %edi + jne LBB1_1 ## bb1 + +//===---------------------------------------------------------------------===// + +Test instructions can be eliminated by using EFLAGS values from arithmetic +instructions. 
This is currently not done for mul, and, or, xor, neg, shl, +sra, srl, shld, shrd, atomic ops, and others. It is also currently not done +for read-modify-write instructions. It is also currently not done if the +OF or CF flags are needed. + +The shift operators have the complication that when the shift count is +zero, EFLAGS is not set, so they can only subsume a test instruction if +the shift count is known to be non-zero. Also, using the EFLAGS value +from a shift is apparently very slow on some x86 implementations. + +In read-modify-write instructions, the root node in the isel match is +the store, and isel has no way for the use of the EFLAGS result of the +arithmetic to be remapped to the new node. + +Add and subtract instructions set OF on signed overflow and CF on unsigned +overflow, while test instructions always clear OF and CF. In order to +replace a test with an add or subtract in a situation where OF or CF is +needed, codegen must be able to prove that the operation cannot see +signed or unsigned overflow, respectively. + +//===---------------------------------------------------------------------===// + +memcpy/memmove do not lower to SSE copies when possible. 
A silly example is: +define <16 x float> @foo(<16 x float> %A) nounwind { + %tmp = alloca <16 x float>, align 16 + %tmp2 = alloca <16 x float>, align 16 + store <16 x float> %A, <16 x float>* %tmp + %s = bitcast <16 x float>* %tmp to i8* + %s2 = bitcast <16 x float>* %tmp2 to i8* + call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16) + %R = load <16 x float>* %tmp2 + ret <16 x float> %R +} + +declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind + +which compiles to: + +_foo: + subl $140, %esp + movaps %xmm3, 112(%esp) + movaps %xmm2, 96(%esp) + movaps %xmm1, 80(%esp) + movaps %xmm0, 64(%esp) + movl 60(%esp), %eax + movl %eax, 124(%esp) + movl 56(%esp), %eax + movl %eax, 120(%esp) + movl 52(%esp), %eax + + movaps (%esp), %xmm0 + movaps 16(%esp), %xmm1 + movaps 32(%esp), %xmm2 + movaps 48(%esp), %xmm3 + addl $140, %esp + ret + +On Nehalem, it may even be cheaper to just use movups when unaligned than to +fall back to lower-granularity chunks. + +//===---------------------------------------------------------------------===// + +Implement processor-specific optimizations for parity with GCC on these +processors. GCC does two optimizations: + +1. ix86_pad_returns inserts a noop before ret instructions if immediately + preceded by a conditional branch or is the target of a jump. +2. ix86_avoid_jump_mispredicts inserts noops in cases where a 16-byte block of + code contains more than 3 branches. 
+ +The first one is done for all AMDs, Core2, and "Generic" +The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona, + Core 2, and "Generic" + +//===---------------------------------------------------------------------===// + +Testcase: +int a(int x) { return (x & 127) > 31; } + +Current output: + movl 4(%esp), %eax + andl $127, %eax + cmpl $31, %eax + seta %al + movzbl %al, %eax + ret + +Ideal output: + xorl %eax, %eax + testl $96, 4(%esp) + setne %al + ret + +This should definitely be done in instcombine, canonicalizing the range +condition into a != condition. We get this IR: + +define i32 @a(i32 %x) nounwind readnone { +entry: + %0 = and i32 %x, 127 ; [#uses=1] + %1 = icmp ugt i32 %0, 31 ; [#uses=1] + %2 = zext i1 %1 to i32 ; [#uses=1] + ret i32 %2 +} + +Instcombine prefers to strength reduce relational comparisons to equality +comparisons when possible, this should be another case of that. This could +be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it +looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already +be redesigned to use ComputeMaskedBits and friends. + + +//===---------------------------------------------------------------------===// +Testcase: +int x(int a) { return (a&0xf0)>>4; } + +Current output: + movl 4(%esp), %eax + shrl $4, %eax + andl $15, %eax + ret + +Ideal output: + movzbl 4(%esp), %eax + shrl $4, %eax + ret + +//===---------------------------------------------------------------------===// + +Testcase: +int x(int a) { return (a & 0x80) ? 0x100 : 0; } +int y(int a) { return (a & 0x80) *2; } + +Current: + testl $128, 4(%esp) + setne %al + movzbl %al, %eax + shll $8, %eax + ret + +Better: + movl 4(%esp), %eax + addl %eax, %eax + andl $256, %eax + ret + +This is another general instcombine transformation that is profitable on all +targets. 
In LLVM IR, these functions look like this: + +define i32 @x(i32 %a) nounwind readnone { +entry: + %0 = and i32 %a, 128 + %1 = icmp eq i32 %0, 0 + %iftmp.0.0 = select i1 %1, i32 0, i32 256 + ret i32 %iftmp.0.0 +} + +define i32 @y(i32 %a) nounwind readnone { +entry: + %0 = shl i32 %a, 1 + %1 = and i32 %0, 256 + ret i32 %1 +} + +Replacing an icmp+select with a shift should always be considered profitable in +instcombine. + +//===---------------------------------------------------------------------===//