//===---------------------------------------------------------------------===//
-CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86
-backend knows how to three-addressify this shift, but it appears the register
-allocator isn't even asking it to do so in this case. We should investigate
-why this isn't happening, it could have significant impact on other important
-cases for X86 as well.
-
-//===---------------------------------------------------------------------===//
-
This should be one DIV/IDIV instruction, not a libcall:
unsigned test(unsigned long long X, unsigned Y) {
processors. GCC does two optimizations:
1. ix86_pad_returns inserts a noop before ret instructions if immediately
- preceeded by a conditional branch or is the target of a jump.
+ preceded by a conditional branch or is the target of a jump.
2. ix86_avoid_jump_mispredicts inserts noops in cases where a 16-byte block of
code contains more than 3 branches.
//===---------------------------------------------------------------------===//
+The x86-64 ABI says:
+
+Booleans, when stored in a memory object, are stored as single byte objects the
+value of which is always 0 (false) or 1 (true).
+
+We are not using this fact:
+
+int bar(_Bool *a) { return *a; }
+
+define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
+ %1 = load i8* %a, align 1, !tbaa !0
+ %tmp = and i8 %1, 1
+ %2 = zext i8 %tmp to i32
+ ret i32 %2
+}
+
+bar:
+ movb (%rdi), %al
+ andb $1, %al
+ movzbl %al, %eax
+ ret
+
+GCC produces
+
+bar:
+ movzbl (%rdi), %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
Consider the following two functions compiled with clang:
_Bool foo(int *x) { return !(*x & 4); }
unsigned bar(int *x) { return !(*x & 4); }
//===---------------------------------------------------------------------===//
-Take the following C code:
-int x(int y) { return (y & 63) << 14; }
-
-Code produced by gcc:
- andl $63, %edi
- sall $14, %edi
- movl %edi, %eax
- ret
-
-Code produced by clang:
- shll $14, %edi
- movl %edi, %eax
- andl $1032192, %eax
- ret
-
-The code produced by gcc is 3 bytes shorter. This sort of construct often
-shows up with bitfields.
-
-//===---------------------------------------------------------------------===//
-
Take the following C code:
int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
We could save an instruction here by commuting the addss.
//===---------------------------------------------------------------------===//
+
+This (from PR9661):
+
+float clamp_float(float a) {
+ if (a > 1.0f)
+ return 1.0f;
+ else if (a < 0.0f)
+ return 0.0f;
+ else
+ return a;
+}
+
+Could compile to:
+
+clamp_float: # @clamp_float
+ movss .LCPI0_0(%rip), %xmm1
+ minss %xmm1, %xmm0
+ pxor %xmm1, %xmm1
+ maxss %xmm1, %xmm0
+ ret
+
+with -ffast-math.
+
+//===---------------------------------------------------------------------===//
+
+This function (from PR9803):
+
+int clamp2(int a) {
+ if (a > 5)
+ a = 5;
+ if (a < 0)
+ return 0;
+ return a;
+}
+
+Compiles to:
+
+_clamp2: ## @clamp2
+ pushq %rbp
+ movq %rsp, %rbp
+ cmpl $5, %edi
+ movl $5, %ecx
+ cmovlel %edi, %ecx
+ testl %ecx, %ecx
+ movl $0, %eax
+ cmovnsl %ecx, %eax
+ popq %rbp
+ ret
+
+The move of 0 could be scheduled above the test so that it can become an
+xor reg,reg (xor clobbers EFLAGS, so it cannot sit between the test and cmov).
+
+//===---------------------------------------------------------------------===//
+
+GCC PR48986. We currently compile this:
+
+void bar(void);
+void yyy(int* p) {
+ if (__sync_fetch_and_add(p, -1) == 1)
+ bar();
+}
+
+into:
+ movl $-1, %eax
+ lock
+ xaddl %eax, (%rdi)
+ cmpl $1, %eax
+ je LBB0_2
+
+Instead we could generate:
+
+ lock
+ decl (%rdi)
+ je LBB0_2
+
+The trick is to match "fetch_and_add(X, -C) == C".
+
+//===---------------------------------------------------------------------===//
+
+unsigned log2(unsigned x) {
+ return x > 1 ? 32-__builtin_clz(x-1) : 0;
+}
+
+generates (x86_64):
+ xorl %eax, %eax
+ cmpl $2, %edi
+ jb LBB0_2
+## BB#1:
+ decl %edi
+ movl $63, %ecx
+ bsrl %edi, %eax
+ cmovel %ecx, %eax
+ xorl $-32, %eax
+ addl $33, %eax
+LBB0_2:
+ ret
+
+The cmov is redundant, since the early test already guarantees that the bsr
+input is nonzero:
+ xorl %eax, %eax
+ cmpl $2, %edi
+ jb LBB0_2
+## BB#1:
+ decl %edi
+ bsrl %edi, %eax
+ xorl $-32, %eax
+ addl $33, %eax
+LBB0_2:
+ ret
+
+//===---------------------------------------------------------------------===//