// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//
-We should add support for the "movbe" instruction, which does a byte-swapping
-copy (3-addr bswap + memory support?) This is available on Atom processors.
-
-//===---------------------------------------------------------------------===//
-
-CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86
-backend knows how to three-addressify this shift, but it appears the register
-allocator isn't even asking it to do so in this case. We should investigate
-why this isn't happening, it could have significant impact on other important
-cases for X86 as well.
-
-//===---------------------------------------------------------------------===//
-
This should be one DIV/IDIV instruction, not a libcall:
unsigned test(unsigned long long X, unsigned Y) {
Some isel ideas:
-1. Dynamic programming based approach when compile time if not an
+1. Dynamic programming based approach when compile time is not an
issue.
2. Code duplication (addressing mode) during isel.
3. Other ideas from "Register-Sensitive Selection, Duplication, and
{ return !full_add(a, b).second; }
Should compile to:
+ addl %esi, %edi
+ setae %al
+ movzbl %al, %eax
+ ret
-
- _Z11no_overflowjj:
- addl %edi, %esi
- setae %al
- ret
-
-FIXME: That code looks wrong; bool return is normally defined as zext.
-
-on x86-64, not:
-
-__Z11no_overflowjj:
- addl %edi, %esi
- cmpl %edi, %esi
- setae %al
- movzbl %al, %eax
- ret
+on x86-64, instead of the rather stupid-looking:
+ addl %esi, %edi
+ setb %al
+ xorb $1, %al
+ movzbl %al, %eax
+ ret
//===---------------------------------------------------------------------===//
instead of:
_foo:
- movl $255, %eax
- orl 4(%esp), %eax
- andl $65535, %eax
- ret
+ movl $65280, %eax
+ andl 4(%esp), %eax
+ orl $255, %eax
+ ret
//===---------------------------------------------------------------------===//
Take the following:
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
target triple = "i386-apple-darwin8"
@in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2]
define fastcc void @abort_gzip() noreturn nounwind {
processors. GCC does two optimizations:
1. ix86_pad_returns inserts a noop before ret instructions if immediately
- preceeded by a conditional branch or is the target of a jump.
+ preceded by a conditional branch or is the target of a jump.
2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
code contains more than 3 branches.
The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
Core 2, and "Generic"
-//===---------------------------------------------------------------------===//
-
-Testcase:
-int a(int x) { return (x & 127) > 31; }
-
-Current output:
- movl 4(%esp), %eax
- andl $127, %eax
- cmpl $31, %eax
- seta %al
- movzbl %al, %eax
- ret
-
-Ideal output:
- xorl %eax, %eax
- testl $96, 4(%esp)
- setne %al
- ret
-
-This should definitely be done in instcombine, canonicalizing the range
-condition into a != condition. We get this IR:
-
-define i32 @a(i32 %x) nounwind readnone {
-entry:
- %0 = and i32 %x, 127 ; <i32> [#uses=1]
- %1 = icmp ugt i32 %0, 31 ; <i1> [#uses=1]
- %2 = zext i1 %1 to i32 ; <i32> [#uses=1]
- ret i32 %2
-}
-
-Instcombine prefers to strength reduce relational comparisons to equality
-comparisons when possible, this should be another case of that. This could
-be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
-looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already
-be redesigned to use ComputeMaskedBits and friends.
-
-
//===---------------------------------------------------------------------===//
Testcase:
int x(int a) { return (a&0xf0)>>4; }
//===---------------------------------------------------------------------===//
-_Bool bar(int *x) { return *x & 1; }
+struct B {
+ unsigned char y0 : 1;
+};
-define zeroext i1 @bar(i32* nocapture %x) nounwind readonly {
-entry:
- %tmp1 = load i32* %x ; <i32> [#uses=1]
- %and = and i32 %tmp1, 1 ; <i32> [#uses=1]
- %tobool = icmp ne i32 %and, 0 ; <i1> [#uses=1]
- ret i1 %tobool
+int bar(struct B* a) { return a->y0; }
+
+define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
+ %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
+ %2 = load i8* %1, align 1
+ %3 = and i8 %2, 1
+ %4 = zext i8 %3 to i32
+ ret i32 %4
}
-bar: # @bar
-# BB#0: # %entry
- movl 4(%esp), %eax
- movb (%eax), %al
- andb $1, %al
- movzbl %al, %eax
- ret
+bar: # @bar
+# BB#0:
+ movb (%rdi), %al
+ andb $1, %al
+ movzbl %al, %eax
+ ret
Missed optimization: should be movl+andl.
//===---------------------------------------------------------------------===//
+The x86_64 abi says:
+
+Booleans, when stored in a memory object, are stored as single byte objects the
+value of which is always 0 (false) or 1 (true).
+
+We are not using this fact:
+
+int bar(_Bool *a) { return *a; }
+
+define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
+ %1 = load i8* %a, align 1, !tbaa !0
+ %tmp = and i8 %1, 1
+ %2 = zext i8 %tmp to i32
+ ret i32 %2
+}
+
+bar:
+ movb (%rdi), %al
+ andb $1, %al
+ movzbl %al, %eax
+ ret
+
+GCC produces
+
+bar:
+ movzbl (%rdi), %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
Consider the following two functions compiled with clang:
_Bool foo(int *x) { return !(*x & 4); }
unsigned bar(int *x) { return !(*x & 4); }
//===---------------------------------------------------------------------===//
-Take the following C code:
-int x(int y) { return (y & 63) << 14; }
-
-Code produced by gcc:
- andl $63, %edi
- sall $14, %edi
- movl %edi, %eax
- ret
-
-Code produced by clang:
- shll $14, %edi
- movl %edi, %eax
- andl $1032192, %eax
- ret
-
-The code produced by gcc is 3 bytes shorter. This sort of construct often
-shows up with bitfields.
-
-//===---------------------------------------------------------------------===//
-
Take the following C code:
int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
//===---------------------------------------------------------------------===//
-This:
-char t(char c) {
- return c/3;
+The hot loop of 256.bzip2 contains code that looks a bit like this:
+
+int foo(char *P, char *Q, int x, int y) {
+ if (P[0] != Q[0])
+ return P[0] < Q[0];
+ if (P[1] != Q[1])
+ return P[1] < Q[1];
+ if (P[2] != Q[2])
+ return P[2] < Q[2];
+ return P[3] < Q[3];
}
-Compiles to: $clang t.c -S -o - -O3 -mkernel -fomit-frame-pointer
+In the real code, we get a lot more wrong than this. However, even in this
+code we generate:
-_t: ## @t
- movslq %edi, %rax
- imulq $-1431655765, %rax, %rcx ## imm = 0xFFFFFFFFAAAAAAAB
- shrq $32, %rcx
- addl %ecx, %eax
- movl %eax, %ecx
- shrl $31, %ecx
- shrl %eax
- addl %ecx, %eax
- movsbl %al, %eax
+_foo: ## @foo
+## BB#0: ## %entry
+ movb (%rsi), %al
+ movb (%rdi), %cl
+ cmpb %al, %cl
+ je LBB0_2
+LBB0_1: ## %if.then
+ cmpb %al, %cl
+ jmp LBB0_5
+LBB0_2: ## %if.end
+ movb 1(%rsi), %al
+ movb 1(%rdi), %cl
+ cmpb %al, %cl
+ jne LBB0_1
+## BB#3: ## %if.end38
+ movb 2(%rsi), %al
+ movb 2(%rdi), %cl
+ cmpb %al, %cl
+ jne LBB0_1
+## BB#4: ## %if.end60
+ movb 3(%rdi), %al
+ cmpb 3(%rsi), %al
+LBB0_5: ## %if.end60
+ setl %al
+ movzbl %al, %eax
ret
-GCC gets:
+Note that we generate jumps to LBB0_1 which does a redundant compare. The
+redundant compare also forces the register values to be live, which prevents
+folding one of the loads into the compare. In contrast, GCC 4.2 produces:
-_t:
- movl $86, %eax
- imulb %dil
- shrw $8, %ax
- sarb $7, %dil
- subb %dil, %al
- movsbl %al,%eax
+_foo:
+ movzbl (%rsi), %eax
+ cmpb %al, (%rdi)
+ jne L10
+L12:
+ movzbl 1(%rsi), %eax
+ cmpb %al, 1(%rdi)
+ jne L10
+ movzbl 2(%rsi), %eax
+ cmpb %al, 2(%rdi)
+ jne L10
+ movzbl 3(%rdi), %eax
+ cmpb 3(%rsi), %al
+L10:
+ setl %al
+ movzbl %al, %eax
+ ret
+
+which is "perfect".
+
+//===---------------------------------------------------------------------===//
+
+For the branch in the following code:
+int a();
+int b(int x, int y) {
+ if (x & (1<<(y&7)))
+ return a();
+ return y;
+}
+
+We currently generate:
+ movb %sil, %al
+ andb $7, %al
+ movzbl %al, %eax
+ btl %eax, %edi
+ jae .LBB0_2
+
+movl+andl would be shorter than the movb+andb+movzbl sequence.
+
+//===---------------------------------------------------------------------===//
+
+For the following:
+struct u1 {
+ float x, y;
+};
+float foo(struct u1 u) {
+ return u.x + u.y;
+}
+
+We currently generate:
+ movdqa %xmm0, %xmm1
+ pshufd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0,0,0]
+ addss %xmm1, %xmm0
ret
-which is nicer. This also happens for int, not just char.
+We could save an instruction here by commuting the addss.
//===---------------------------------------------------------------------===//
+This (from PR9661):
+float clamp_float(float a) {
+ if (a > 1.0f)
+ return 1.0f;
+ else if (a < 0.0f)
+ return 0.0f;
+ else
+ return a;
+}
+Could compile to:
+
+clamp_float: # @clamp_float
+ movss .LCPI0_0(%rip), %xmm1
+ minss %xmm1, %xmm0
+ pxor %xmm1, %xmm1
+ maxss %xmm1, %xmm0
+ ret
+
+with -ffast-math.
+
+//===---------------------------------------------------------------------===//
+
+This function (from PR9803):
+
+int clamp2(int a) {
+ if (a > 5)
+ a = 5;
+ if (a < 0)
+ return 0;
+ return a;
+}
+
+Compiles to:
+
+_clamp2: ## @clamp2
+ pushq %rbp
+ movq %rsp, %rbp
+ cmpl $5, %edi
+ movl $5, %ecx
+ cmovlel %edi, %ecx
+ testl %ecx, %ecx
+ movl $0, %eax
+ cmovnsl %ecx, %eax
+ popq %rbp
+ ret
+
+The move of 0 could be scheduled above the test to make it is xor reg,reg.
+
+//===---------------------------------------------------------------------===//
+
+GCC PR48986. We currently compile this:
+
+void bar(void);
+void yyy(int* p) {
+ if (__sync_fetch_and_add(p, -1) == 1)
+ bar();
+}
+
+into:
+ movl $-1, %eax
+ lock
+ xaddl %eax, (%rdi)
+ cmpl $1, %eax
+ je LBB0_2
+
+Instead we could generate:
+
+ lock
+ dec %rdi
+ je LBB0_2
+
+The trick is to match "fetch_and_add(X, -C) == C".
+
+//===---------------------------------------------------------------------===//
+
+unsigned t(unsigned a, unsigned b) {
+ return a <= b ? 5 : -5;
+}
+
+We generate:
+ movl $5, %ecx
+ cmpl %esi, %edi
+ movl $-5, %eax
+ cmovbel %ecx, %eax
+
+GCC:
+ cmpl %edi, %esi
+ sbbl %eax, %eax
+ andl $-10, %eax
+ addl $5, %eax
+
+//===---------------------------------------------------------------------===//