diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 767771f082a..6a8a4fdf252 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -2,19 +2,6 @@
 // Random ideas for the X86 backend.
 //===---------------------------------------------------------------------===//
 
-We should add support for the "movbe" instruction, which does a byte-swapping
-copy (3-addr bswap + memory support?) This is available on Atom processors.
-
-//===---------------------------------------------------------------------===//
-
-CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86
-backend knows how to three-addressify this shift, but it appears the register
-allocator isn't even asking it to do so in this case. We should investigate
-why this isn't happening, it could have significant impact on other important
-cases for X86 as well.
-
-//===---------------------------------------------------------------------===//
-
 This should be one DIV/IDIV instruction, not a libcall:
 
 unsigned test(unsigned long long X, unsigned Y) {
@@ -67,22 +54,9 @@ cmovs, we should expand to a conditional branch like GCC produces.
 
 //===---------------------------------------------------------------------===//
 
-Compile this:
-_Bool f(_Bool a) { return a!=1; }
-
-into:
-	movzbl	%dil, %eax
-	xorl	$1, %eax
-	ret
-
-(Although note that this isn't a legal way to express the code that llvm-gcc
-currently generates for that function.)
-
-//===---------------------------------------------------------------------===//
-
 Some isel ideas:
-1. Dynamic programming based approach when compile time if not an
+1. Dynamic programming based approach when compile time is not an
    issue.
 2. Code duplication (addressing mode) during isel.
 3. Other ideas from "Register-Sensitive Selection, Duplication, and
@@ -109,6 +83,37 @@ It appears icc uses push for parameter passing. Need to investigate.
 
 //===---------------------------------------------------------------------===//
 
+This:
+
+void foo(void);
+void bar(int x, int *P) {
+  x >>= 2;
+  if (x)
+    foo();
+  *P = x;
+}
+
+compiles into:
+
+	movq	%rsi, %rbx
+	movl	%edi, %r14d
+	sarl	$2, %r14d
+	testl	%r14d, %r14d
+	je	LBB0_2
+
+Instead of doing an explicit test, we can use the flags off the sar.  This
+occurs in a bigger testcase like this, which is pretty common:
+
+#include <vector>
+int test1(std::vector<int> &X) {
+  int Sum = 0;
+  for (long i = 0, e = X.size(); i != e; ++i)
+    X[i] = 0;
+  return Sum;
+}
+
+//===---------------------------------------------------------------------===//
+
 Only use inc/neg/not instructions on processors where they are faster than
 add/sub/xor.  They are slower on the P4 due to only updating some processor
 flags.
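+
+A small illustration of the tradeoff (a hypothetical example, not from a
+testcase):
+
+int plus1(int x) { return x + 1; }
+
+Here "incl %eax" is shorter than "addl $1, %eax", but because inc leaves CF
+unmodified, it can stall on the P4 waiting for the previous flags value.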
 
@@ -123,20 +128,6 @@ when it can invert the result of the compare for free.
 
 //===---------------------------------------------------------------------===//
 
-How about intrinsics? An example is:
- *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
-
-compiles to
-	pmuludq	(%eax), %xmm0
-	movl	8(%esp), %eax
-	movdqa	(%eax), %xmm1
-	pmulhuw	%xmm0, %xmm1
-
-The transformation probably requires a X86 specific pass or a DAG combiner
-target specific hook.
-
-//===---------------------------------------------------------------------===//
-
 In many cases, LLVM generates code like this:
 
 _test:
@@ -241,11 +232,6 @@ lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and
 Treesor.
 
 //===---------------------------------------------------------------------===//
 
-Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
-FR64 to VR128.
-
-//===---------------------------------------------------------------------===//
-
 Adding to the list of cmp / test poor codegen issues:
 
 int test(__m128 *A, __m128 *B) {
@@ -413,72 +399,8 @@ boundary to improve performance.
 
 //===---------------------------------------------------------------------===//
 
-Codegen:
-
-int f(int a, int b) {
-  if (a == 4 || a == 6)
-    b++;
-  return b;
-}
-
-
-as:
-
-or eax, 2
-cmp eax, 6
-jz label
-
-//===---------------------------------------------------------------------===//
-
 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
-simplifications for integer "x cmp y ? a : b".  For example, instead of:
-
-int G;
-void f(int X, int Y) {
-  G = X < 0 ? 14 : 13;
-}
-
-compiling to:
-
-_f:
-	movl	$14, %eax
-	movl	$13, %ecx
-	movl	4(%esp), %edx
-	testl	%edx, %edx
-	cmovl	%eax, %ecx
-	movl	%ecx, _G
-	ret
-
-it could be:
-_f:
-	movl	4(%esp), %eax
-	sarl	$31, %eax
-	notl	%eax
-	addl	$14, %eax
-	movl	%eax, _G
-	ret
-
-etc.
-
-Another is:
-int usesbb(unsigned int a, unsigned int b) {
-  return (a < b ? -1 : 0);
-}
-to:
-_usesbb:
-	movl	8(%esp), %eax
-	cmpl	%eax, 4(%esp)
-	sbbl	%eax, %eax
-	ret
-
-instead of:
-_usesbb:
-	xorl	%eax, %eax
-	movl	8(%esp), %ecx
-	cmpl	%ecx, 4(%esp)
-	movl	$4294967295, %ecx
-	cmovb	%ecx, %eax
-	ret
+simplifications for integer "x cmp y ? a : b".
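+
+For example, for:
+
+int G;
+void f(int X) {
+  G = X < 0 ? 14 : 13;
+}
+
+a branchless sequence like:
+
+	movl	4(%esp), %eax
+	sarl	$31, %eax
+	notl	%eax
+	addl	$14, %eax
+	movl	%eax, _G
+
+works, since ~(X >> 31) is 0 when X is negative and -1 otherwise, so adding
+14 yields 14 or 13 with no cmov or branch.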
 
 //===---------------------------------------------------------------------===//
 
@@ -544,7 +466,7 @@ We should inline lrintf and probably other libc functions.
 
 //===---------------------------------------------------------------------===//
 
-Start using the flags more.  For example, compile:
+Use the FLAGS values from arithmetic instructions more.  For example, compile:
 
 int add_zf(int *x, int y, int a, int b) {
   if ((*x += y) == 0)
@@ -568,31 +490,8 @@ _add_zf:
 	movl	%ecx, %eax
 	ret
 
-and:
-
-int add_zf(int *x, int y, int a, int b) {
-  if ((*x + y) < 0)
-    return a;
-  else
-    return b;
-}
-
-to:
-
-add_zf:
-	addl	(%rdi), %esi
-	movl	%edx, %eax
-	cmovns	%ecx, %eax
-	ret
-
-instead of:
-
-_add_zf:
-	addl	(%rdi), %esi
-	testl	%esi, %esi
-	cmovs	%edx, %ecx
-	movl	%ecx, %eax
-	ret
+As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
+without a test instruction.
 
 //===---------------------------------------------------------------------===//
 
@@ -699,55 +598,6 @@ Though this probably isn't worth it.
 
 //===---------------------------------------------------------------------===//
 
-We need to teach the codegen to convert two-address INC instructions to LEA
-when the flags are dead (likewise dec).  For example, on X86-64, compile:
-
-int foo(int A, int B) {
-  return A+1;
-}
-
-to:
-
-_foo:
-	leal	1(%edi), %eax
-	ret
-
-instead of:
-
-_foo:
-	incl	%edi
-	movl	%edi, %eax
-	ret
-
-Another example is:
-
-;; X's live range extends beyond the shift, so the register allocator
-;; cannot coalesce it with Y.  Because of this, a copy needs to be
-;; emitted before the shift to save the register value before it is
-;; clobbered.  However, this copy is not needed if the register
-;; allocator turns the shift into an LEA.  This also occurs for ADD.
-
-; Check that the shift gets turned into an LEA.
-; RUN: llvm-as < %s | llc -march=x86 -x86-asm-syntax=intel | \
-; RUN:   not grep {mov E.X, E.X}
-
-@G = external global i32		; <i32*> [#uses=3]
-
-define i32 @test1(i32 %X, i32 %Y) {
-	%Z = add i32 %X, %Y		; <i32> [#uses=1]
-	volatile store i32 %Y, i32* @G
-	volatile store i32 %Z, i32* @G
-	ret i32 %X
-}
-
-define i32 @test2(i32 %X) {
-	%Z = add i32 %X, 1		; <i32> [#uses=1]
-	volatile store i32 %Z, i32* @G
-	ret i32 %X
-}
-
-//===---------------------------------------------------------------------===//
-
 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
 a neg instead of a sub instruction.  Consider:
 
@@ -847,29 +697,18 @@ This:
 { return !full_add(a, b).second; }
 
 Should compile to:
+	addl	%esi, %edi
+	setae	%al
+	movzbl	%al, %eax
+	ret
+on x86-64, instead of the rather stupid-looking:
+	addl	%esi, %edi
+	setb	%al
+	xorb	$1, %al
+	movzbl	%al, %eax
+	ret
 
-	_Z11no_overflowjj:
-	addl	%edi, %esi
-	setae	%al
-	ret
-
-FIXME: That code looks wrong; bool return is normally defined as zext.
-
-on x86-64, not:
-
-__Z11no_overflowjj:
-	addl	%edi, %esi
-	cmpl	%edi, %esi
-	setae	%al
-	movzbl	%al, %eax
-	ret
-
-
-//===---------------------------------------------------------------------===//
-
-Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
-condition register is dead.  xor reg reg is shorter than mov reg, #0.
 
 //===---------------------------------------------------------------------===//
 
@@ -1136,10 +975,10 @@ _foo:
 
 instead of:
 
 _foo:
-	movl	$255, %eax
-	orl	4(%esp), %eax
-	andl	$65535, %eax
-	ret
+	movl	$65280, %eax
+	andl	4(%esp), %eax
+	orl	$255, %eax
+	ret
 
 //===---------------------------------------------------------------------===//
 
@@ -1199,57 +1038,6 @@ be folded into:
 	shl	[mem], 1
 
 //===---------------------------------------------------------------------===//
 
-This testcase misses a read/modify/write opportunity (from PR1425):
-
-void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
-    int i;
-    for(i=0; i<width; i++){
-        b1[i] += (1*(b0[i] + b2[i])+0)>>0;
-    }
-}
-
-We compile it down to:
-
-LBB1_2:	# bb
-	movl	(%esi,%edi,4), %ebx
-	addl	(%ecx,%edi,4), %ebx
-	addl	(%edx,%edi,4), %ebx
-	movl	%ebx, (%ecx,%edi,4)
-	incl	%edi
-	cmpl	%eax, %edi
-	jne	LBB1_2	# bb
-
-the inner loop should add to the memory location (%ecx,%edi,4), saving
-a mov.  Something like:
-
-	movl	(%esi,%edi,4), %ebx
-	addl	(%edx,%edi,4), %ebx
-	addl	%ebx, (%ecx,%edi,4)
-
-Here is another interesting example:
-
-void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
-    int i;
-    for(i=0; i<width; i++){
-        b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
-    }
-}
-
-We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
-
-LBB9_2:	# bb
-	movl	(%ecx,%edi,4), %ebx
-	subl	(%esi,%edi,4), %ebx
-	subl	(%edx,%edi,4), %ebx
-	movl	%ebx, (%ecx,%edi,4)
-	incl	%edi
-	cmpl	%eax, %edi
-	jne	LBB9_2	# bb
-
-Additionally, LSR should rewrite the exit condition of these loops to use
-a stride-4 IV, which would allow all the scales in the loop to go away.
-This would result in smaller code and more efficient microops.
-
-//===---------------------------------------------------------------------===//
-
 In SSE mode, we turn abs and neg into a load from the constant pool plus a
 xor or and instruction, for example:
 
@@ -1282,13 +1070,6 @@ void test(double *P) {
 
 //===---------------------------------------------------------------------===//
 
-handling llvm.memory.barrier on pre SSE2 cpus
-
-should generate:
-lock ; mov %esp, %esp
-
-//===---------------------------------------------------------------------===//
-
 The generated code on x86 for checking for signed overflow on a multiply the
 obvious way is much longer than it needs to be.
 
@@ -1319,58 +1100,6 @@ abs:
 
 //===---------------------------------------------------------------------===//
 
-Consider:
-int test(unsigned long a, unsigned long b) { return -(a < b); }
-
-We currently compile this to:
-
-define i32 @test(i32 %a, i32 %b) nounwind {
-	%tmp3 = icmp ult i32 %a, %b		; <i1> [#uses=1]
-	%tmp34 = zext i1 %tmp3 to i32		; <i32> [#uses=1]
-	%tmp5 = sub i32 0, %tmp34		; <i32> [#uses=1]
-	ret i32 %tmp5
-}
-
-and
-
-_test:
-	movl	8(%esp), %eax
-	cmpl	%eax, 4(%esp)
-	setb	%al
-	movzbl	%al, %eax
-	negl	%eax
-	ret
-
-Several deficiencies here.  First, we should instcombine zext+neg into sext:
-
-define i32 @test2(i32 %a, i32 %b) nounwind {
-	%tmp3 = icmp ult i32 %a, %b		; <i1> [#uses=1]
-	%tmp34 = sext i1 %tmp3 to i32		; <i32> [#uses=1]
-	ret i32 %tmp34
-}
-
-However, before we can do that, we have to fix the bad codegen that we get for
-sext from bool:
-
-_test2:
-	movl	8(%esp), %eax
-	cmpl	%eax, 4(%esp)
-	setb	%al
-	movzbl	%al, %eax
-	shll	$31, %eax
-	sarl	$31, %eax
-	ret
-
-This code should be at least as good as the code above.  Once this is fixed, we
-can optimize this specific case even more to:
-
-	movl	8(%esp), %eax
-	xorl	%ecx, %ecx
-	cmpl	%eax, 4(%esp)
-	sbbl	%ecx, %ecx
-
-//===---------------------------------------------------------------------===//
-
 Take the following code (from
 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
 
@@ -1397,15 +1126,8 @@ FirstOnet:
 	xorl	%eax, %eax
 	ret
 
-There are a few possible improvements here:
-1. We should be able to eliminate the dead load into %ecx
-2. We could change the "movl 8(%esp), %eax" into
-   "movzwl 10(%esp), %eax"; this lets us change the cmpl
-   into a testl, which is shorter, and eliminate the shift.
-
-We could also in theory eliminate the branch by using a conditional
-for the address of the load, but that seems unlikely to be worthwhile
-in general.
+We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
+lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
 
 //===---------------------------------------------------------------------===//
 
@@ -1427,22 +1149,23 @@ bb7:		; preds = %entry
 
 to:
 
-_foo:
+foo:                                    # @foo
+# BB#0:                                 # %entry
+	movl	4(%esp), %ecx
 	cmpb	$0, 16(%esp)
-	movl	12(%esp), %ecx
+	je	.LBB0_2
+# BB#1:                                 # %bb
 	movl	8(%esp), %eax
-	movl	4(%esp), %edx
-	je	LBB1_2	# bb7
-LBB1_1:	# bb
-	addl	%edx, %eax
+	addl	%ecx, %eax
 	ret
-LBB1_2:	# bb7
-	movl	%edx, %eax
-	subl	%ecx, %eax
+.LBB0_2:                                # %bb7
+	movl	12(%esp), %edx
+	movl	%ecx, %eax
+	subl	%edx, %eax
 	ret
 
-The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2
-if it commuted the addl in LBB1_1.
+There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
+couple more movls by putting 4(%esp) into %eax instead of %ecx.
 
 //===---------------------------------------------------------------------===//
 
@@ -1492,10 +1215,9 @@ Also check why xmm7 is not used at all in the function.
 
 //===---------------------------------------------------------------------===//
 
-Legalize loses track of the fact that bools are always zero extended when in
-memory.  This causes us to compile abort_gzip (from 164.gzip) from:
+Take the following:
 
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
 target triple = "i386-apple-darwin8"
 @in_exit.4870.b = internal global i1 false		; <i1*> [#uses=2]
 define fastcc void @abort_gzip() noreturn nounwind {
@@ -1512,16 +1234,15 @@ bb4.i:		; preds = %entry
 }
 declare void @exit(i32) noreturn nounwind
 
-into:
-
-_abort_gzip:
+This compiles into:
+_abort_gzip:                            ## @abort_gzip
+## BB#0:                                ## %entry
 	subl	$12, %esp
 	movb	_in_exit.4870.b, %al
-	notb	%al
-	testb	$1, %al
-	jne	LBB1_2	## bb4.i
-LBB1_1:	## bb.i
-	...
+	cmpb	$1, %al
+	jne	LBB0_2
+
+We somehow miss folding the movb into the cmpb.
 
 //===---------------------------------------------------------------------===//
 
@@ -1762,6 +1483,13 @@ LBB1_1:	## bb1
 	cmpl	$150, %edi
 	jne	LBB1_1	## bb1
 
+The issue is that we hoist the cast of "scaler" to long long outside of the
+loop, so the value comes into the loop as two values, and
+RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
+constructed BUILD_PAIR which represents the cast value.
+
+This can be handled by making CodeGenPrepare sink the cast.
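+
+A rough sketch of the before/after shapes (use() and the loop bound are
+hypothetical stand-ins):
+
+void use(long long);
+
+void before(int scaler) {
+  long long t = (long long)scaler;  /* cast hoisted; loop carries a pair */
+  for (int i = 0; i != 150; ++i)
+    use(t);
+}
+
+void after(int scaler) {
+  for (int i = 0; i != 150; ++i)
+    use((long long)scaler);         /* cast sunk next to its use */
+}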
+
 //===---------------------------------------------------------------------===//
 
 Test instructions can be eliminated by using EFLAGS values from arithmetic
@@ -1831,7 +1559,7 @@ Implement processor-specific optimizations for parity with GCC on these
 processors.  GCC does two optimizations:
 
 1. ix86_pad_returns inserts a noop before ret instructions if immediately
-   preceeded by a conditional branch or is the target of a jump.
+   preceded by a conditional branch or is the target of a jump.
 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
    code contains more than 3 branches.
 
@@ -1893,22 +1621,460 @@ Ideal output:
 
 //===---------------------------------------------------------------------===//
 
-Testcase:
-int x(int a) { return (a & 0x80) ? 0x100 : 0; }
-
-Current output:
-	testl	$128, 4(%esp)
-	setne	%al
-	movzbl	%al, %eax
-	shll	$8, %eax
-	ret
-
-Ideal output:
-	movl	4(%esp), %eax
-	addl	%eax, %eax
-	andl	$256, %eax
-	ret
-
-We generally want to fold shifted tests of a single bit into a shift+and on x86.
+Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
+properly.
+
+When the return value is not used (i.e. only care about the value in the
+memory), x86 does not have to use add to implement these.  Instead, it can use
+add, sub, inc, dec instructions with the "lock" prefix.
+
+This is currently implemented using a bit of instruction selection trick.  The
+issue is the target independent pattern produces one output and a chain and we
+want to map it into one that just outputs a chain.  The current trick is to
+select it into a MERGE_VALUES with the first definition being an implicit_def.
+The proper solution is to add new ISD opcodes for the no-output variant.  DAG
+combiner can then transform the node before it gets to target node selection.
+
+Problem #2 is we are adding a whole bunch of x86 atomic instructions when in
+fact these instructions are identical to the non-lock versions.  We need a way
+to add target specific information to target nodes and have this information
+carried over to machine instructions.  Asm printer (or JIT) can use this
+information to add the "lock" prefix.
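+
+A minimal illustration of the unused-result case (the function name is
+hypothetical):
+
+void refcount_bump(int *p) {
+  __sync_add_and_fetch(p, 1);   /* result ignored */
+}
+
+Since only the memory value matters here, this can be lowered to
+"lock addl $1, (%rdi)" (or "lock incl") instead of a lock xadd that
+materializes the old value in a register.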
+
+//===---------------------------------------------------------------------===//
+
+struct B {
+  unsigned char y0 : 1;
+};
+
+int bar(struct B* a) { return a->y0; }
+
+define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
+  %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
+  %2 = load i8* %1, align 1
+  %3 = and i8 %2, 1
+  %4 = zext i8 %3 to i32
+  ret i32 %4
+}
+
+bar:                                    # @bar
+# BB#0:
+	movb	(%rdi), %al
+	andb	$1, %al
+	movzbl	%al, %eax
+	ret
+
+Missed optimization: should be movl+andl.
+
+//===---------------------------------------------------------------------===//
+
+The x86_64 abi says:
+
+Booleans, when stored in a memory object, are stored as single byte objects the
+value of which is always 0 (false) or 1 (true).
+
+We are not using this fact:
+
+int bar(_Bool *a) { return *a; }
+
+define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
+  %1 = load i8* %a, align 1, !tbaa !0
+  %tmp = and i8 %1, 1
+  %2 = zext i8 %tmp to i32
+  ret i32 %2
+}
+
+bar:
+	movb	(%rdi), %al
+	andb	$1, %al
+	movzbl	%al, %eax
+	ret
+
+GCC produces:
+
+bar:
+	movzbl	(%rdi), %eax
+	ret
+
+//===---------------------------------------------------------------------===//
+
+Consider the following two functions compiled with clang:
+_Bool foo(int *x) { return !(*x & 4); }
+unsigned bar(int *x) { return !(*x & 4); }
+
+foo:
+	movl	4(%esp), %eax
+	testb	$4, (%eax)
+	sete	%al
+	movzbl	%al, %eax
+	ret
+
+bar:
+	movl	4(%esp), %eax
+	movl	(%eax), %eax
+	shrl	$2, %eax
+	andl	$1, %eax
+	xorl	$1, %eax
+	ret
+
+The second function generates more code even though the two functions are
+functionally identical.
+
+//===---------------------------------------------------------------------===//
+
+Take the following C code:
+int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
+
+We generate the following IR with clang:
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+  %tmp = xor i32 %b, %a                           ; <i32> [#uses=1]
+  %tmp6 = and i32 %tmp, 255                       ; <i32> [#uses=1]
+  %cmp = icmp eq i32 %tmp6, 0                     ; <i1> [#uses=1]
+  %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
+  ret i32 %conv5
+}
+
+And the following x86 code:
+	xorl	%esi, %edi
+	testb	$-1, %dil
+	sete	%al
+	movzbl	%al, %eax
+	ret
+
+A cmpb instead of the xorl+testb would be one instruction shorter.
+
+//===---------------------------------------------------------------------===//
+
+Given the following C code:
+int f(int a, int b) { return (signed char)a == (signed char)b; }
+
+We generate the following IR with clang:
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+  %sext = shl i32 %a, 24                          ; <i32> [#uses=1]
+  %conv1 = ashr i32 %sext, 24                     ; <i32> [#uses=1]
+  %sext6 = shl i32 %b, 24                         ; <i32> [#uses=1]
+  %conv4 = ashr i32 %sext6, 24                    ; <i32> [#uses=1]
+  %cmp = icmp eq i32 %conv1, %conv4               ; <i1> [#uses=1]
+  %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
+  ret i32 %conv5
+}
+
+And the following x86 code:
+	movsbl	%sil, %eax
+	movsbl	%dil, %ecx
+	cmpl	%eax, %ecx
+	sete	%al
+	movzbl	%al, %eax
+	ret
+
+It should be possible to eliminate the sign extensions.
+
+//===---------------------------------------------------------------------===//
+
+LLVM misses a load+store narrowing opportunity in this code:
+
+%struct.bf = type { i64, i16, i16, i32 }
+
+@bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]
+
+define void @t1() nounwind ssp {
+entry:
+  %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
+  %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
+  %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
+  %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
+  %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
+  store i32 %4, i32* %2, align 1
+  %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
+  %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
+  %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
+  %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
+  %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
+  store i32 %9, i32* %7, align 1
+  ret void
+}
+
+LLVM currently emits this:
+
+	movq	bfi(%rip), %rax
+	andl	$-65537, 8(%rax)
+	movq	bfi(%rip), %rax
+	andl	$-131073, 8(%rax)
+	ret
+
+It could narrow the loads and stores to emit this:
+
+	movq	bfi(%rip), %rax
+	andb	$-2, 10(%rax)
+	movq	bfi(%rip), %rax
+	andb	$-3, 10(%rax)
+	ret
+
+The trouble is that there is a TokenFactor between the store and the
+load, making it non-trivial to determine if there's anything between
+the load and the store which would prohibit narrowing.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+void foo(unsigned x) {
+  if (x == 0) bar();
+  else if (x == 1) qux();
+}
+
+currently compiles into:
+_foo:
+	movl	4(%esp), %eax
+	cmpl	$1, %eax
+	je	LBB0_3
+	testl	%eax, %eax
+	jne	LBB0_4
+
+the testl could be removed:
+_foo:
+	movl	4(%esp), %eax
+	cmpl	$1, %eax
+	je	LBB0_3
+	jb	LBB0_4
+
+0 is the only unsigned number < 1.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+%0 = type { i32, i1 }
+
+define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
+entry:
+  %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
+  %cmp = extractvalue %0 %uadd, 1
+  %inc = zext i1 %cmp to i32
+  %add = add i32 %x, %sum
+  %z.0 = add i32 %add, %inc
+  ret i32 %z.0
+}
+
+declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
+
+compiles to:
+
+_add32carry:                            ## @add32carry
+	addl	%esi, %edi
+	sbbl	%ecx, %ecx
+	movl	%edi, %eax
+	subl	%ecx, %eax
+	ret
+
+But it could be:
+
+_add32carry:
+	leal	(%rsi,%rdi), %eax
+	cmpl	%esi, %eax
+	adcl	$0, %eax
+	ret
+
+//===---------------------------------------------------------------------===//
+
+The hot loop of 256.bzip2 contains code that looks a bit like this:
+
+int foo(char *P, char *Q, int x, int y) {
+  if (P[0] != Q[0])
+     return P[0] < Q[0];
+  if (P[1] != Q[1])
+     return P[1] < Q[1];
+  if (P[2] != Q[2])
+     return P[2] < Q[2];
+  return P[3] < Q[3];
+}
+
+In the real code, we get a lot more wrong than this.  However, even in this
+code we generate:
+
+_foo:                                   ## @foo
+## BB#0:                                ## %entry
+	movb	(%rsi), %al
+	movb	(%rdi), %cl
+	cmpb	%al, %cl
+	je	LBB0_2
+LBB0_1:                                 ## %if.then
+	cmpb	%al, %cl
+	jmp	LBB0_5
+LBB0_2:                                 ## %if.end
+	movb	1(%rsi), %al
+	movb	1(%rdi), %cl
+	cmpb	%al, %cl
+	jne	LBB0_1
+## BB#3:                                ## %if.end38
+	movb	2(%rsi), %al
+	movb	2(%rdi), %cl
+	cmpb	%al, %cl
+	jne	LBB0_1
+## BB#4:                                ## %if.end60
+	movb	3(%rdi), %al
+	cmpb	3(%rsi), %al
+LBB0_5:                                 ## %if.end60
+	setl	%al
+	movzbl	%al, %eax
+	ret
+
+Note that we generate jumps to LBB0_1 which does a redundant compare.  The
+redundant compare also forces the register values to be live, which prevents
+folding one of the loads into the compare.  In contrast, GCC 4.2 produces:
+
+_foo:
+	movzbl	(%rsi), %eax
+	cmpb	%al, (%rdi)
+	jne	L10
+L12:
+	movzbl	1(%rsi), %eax
+	cmpb	%al, 1(%rdi)
+	jne	L10
+	movzbl	2(%rsi), %eax
+	cmpb	%al, 2(%rdi)
+	jne	L10
+	movzbl	3(%rdi), %eax
+	cmpb	3(%rsi), %al
+L10:
+	setl	%al
+	movzbl	%al, %eax
+	ret
+
+which is "perfect".
+
+//===---------------------------------------------------------------------===//
+
+For the branch in the following code:
+int a();
+int b(int x, int y) {
+  if (x & (1<<(y&7)))
+    return a();
+  return y;
+}
+
+We currently generate:
+	movb	%sil, %al
+	andb	$7, %al
+	movzbl	%al, %eax
+	btl	%eax, %edi
+	jae	.LBB0_2
+
+movl+andl would be shorter than the movb+andb+movzbl sequence.
+
+//===---------------------------------------------------------------------===//
+
+For the following:
+struct u1 {
+    float x, y;
+};
+float foo(struct u1 u) {
+    return u.x + u.y;
+}
+
+We currently generate:
+	movdqa	%xmm0, %xmm1
+	pshufd	$1, %xmm0, %xmm0        # xmm0 = xmm0[1,0,0,0]
+	addss	%xmm1, %xmm0
+	ret
+
+We could save an instruction here by commuting the addss.
+
+//===---------------------------------------------------------------------===//
+
+This (from PR9661):
+
+float clamp_float(float a) {
+        if (a > 1.0f)
+                return 1.0f;
+        else if (a < 0.0f)
+                return 0.0f;
+        else
+                return a;
+}
+
+Could compile to:
+
+clamp_float:                            # @clamp_float
+	movss	.LCPI0_0(%rip), %xmm1
+	minss	%xmm1, %xmm0
+	pxor	%xmm1, %xmm1
+	maxss	%xmm1, %xmm0
+	ret
+
+with -ffast-math.
+
+//===---------------------------------------------------------------------===//
+
+This function (from PR9803):
+
+int clamp2(int a) {
+        if (a > 5)
+                a = 5;
+        if (a < 0)
+                return 0;
+        return a;
+}
+
+Compiles to:
+
+_clamp2:                                ## @clamp2
+	pushq	%rbp
+	movq	%rsp, %rbp
+	cmpl	$5, %edi
+	movl	$5, %ecx
+	cmovlel	%edi, %ecx
+	testl	%ecx, %ecx
+	movl	$0, %eax
+	cmovnsl	%ecx, %eax
+	popq	%rbp
+	ret
+
+The move of 0 could be scheduled above the test to make it an xor reg,reg.
+
+//===---------------------------------------------------------------------===//
+
+GCC PR48986.  We currently compile this:
+
+void bar(void);
+void yyy(int* p) {
+    if (__sync_fetch_and_add(p, -1) == 1)
+      bar();
+}
+
+into:
+	movl	$-1, %eax
+	lock
+	xaddl	%eax, (%rdi)
+	cmpl	$1, %eax
+	je	LBB0_2
+
+Instead we could generate:
+	lock
+	decl	(%rdi)
+	je	LBB0_2
+
+The trick is to match "fetch_and_add(X, -C) == C".
+
+//===---------------------------------------------------------------------===//
+
+unsigned t(unsigned a, unsigned b) {
+  return a <= b ? 5 : -5;
+}
+
+We generate:
+	movl	$5, %ecx
+	cmpl	%esi, %edi
+	movl	$-5, %eax
+	cmovbel	%ecx, %eax
+
+GCC:
+	cmpl	%edi, %esi
+	sbbl	%eax, %eax
+	andl	$-10, %eax
+	addl	$5, %eax
+
+//===---------------------------------------------------------------------===//