//===---------------------------------------------------------------------===//
-Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
-FR64 to VR128.
-
-//===---------------------------------------------------------------------===//
-
Adding to the list of cmp / test poor codegen issues:
int test(__m128 *A, __m128 *B) {
//===---------------------------------------------------------------------===//
+Linux is missing some basic tail call support:
+
+#include <math.h>
+double foo(double a) { return sin(a); }
+
+This compiles into this on x86-64 Linux (but not darwin):
+foo:
+ subq $8, %rsp
+ call sin
+ addq $8, %rsp
+ ret
+vs:
+
+foo:
+ jmp sin
+
+//===---------------------------------------------------------------------===//
+
Tail call optimization improvements: Tail call optimization currently
pushes all arguments on the top of the stack (their normal place for
non-tail call optimized calls) that source from the callers arguments
//===---------------------------------------------------------------------===//
-This testcase misses a read/modify/write opportunity (from PR1425):
-
-void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
- int i;
- for(i=0; i<width; i++)
- b1[i] += (1*(b0[i] + b2[i])+0)>>0;
-}
-
-We compile it down to:
-
-LBB1_2: # bb
- movl (%esi,%edi,4), %ebx
- addl (%ecx,%edi,4), %ebx
- addl (%edx,%edi,4), %ebx
- movl %ebx, (%ecx,%edi,4)
- incl %edi
- cmpl %eax, %edi
- jne LBB1_2 # bb
-
-the inner loop should add to the memory location (%ecx,%edi,4), saving
-a mov. Something like:
-
- movl (%esi,%edi,4), %ebx
- addl (%edx,%edi,4), %ebx
- addl %ebx, (%ecx,%edi,4)
-
-Here is another interesting example:
-
-void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
- int i;
- for(i=0; i<width; i++)
- b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
-}
-
-We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
-
-LBB9_2: # bb
- movl (%ecx,%edi,4), %ebx
- subl (%esi,%edi,4), %ebx
- subl (%edx,%edi,4), %ebx
- movl %ebx, (%ecx,%edi,4)
- incl %edi
- cmpl %eax, %edi
- jne LBB9_2 # bb
-
-Additionally, LSR should rewrite the exit condition of these loops to use
-a stride-4 IV, would would allow all the scales in the loop to go away.
-This would result in smaller code and more efficient microops.
-
-//===---------------------------------------------------------------------===//
-
In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
or and instruction, for example:
//===---------------------------------------------------------------------===//
-handling llvm.memory.barrier on pre SSE2 cpus
-
-should generate:
-lock ; mov %esp, %esp
-
-//===---------------------------------------------------------------------===//
-
The generated code on x86 for checking for signed overflow on a multiply the
obvious way is much longer than it needs to be.
xorl %eax, %eax
ret
-There are a few possible improvements here:
-1. We should be able to eliminate the dead load into %ecx
-2. We could change the "movl 8(%esp), %eax" into
- "movzwl 10(%esp), %eax"; this lets us change the cmpl
- into a testl, which is shorter, and eliminate the shift.
-
-We could also in theory eliminate the branch by using a conditional
-for the address of the load, but that seems unlikely to be worthwhile
-in general.
+We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
+lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
//===---------------------------------------------------------------------===//
to:
-_foo:
+foo: # @foo
+# BB#0: # %entry
+ movl 4(%esp), %ecx
cmpb $0, 16(%esp)
- movl 12(%esp), %ecx
+ je .LBB0_2
+# BB#1: # %bb
movl 8(%esp), %eax
- movl 4(%esp), %edx
- je LBB1_2 # bb7
-LBB1_1: # bb
- addl %edx, %eax
+ addl %ecx, %eax
ret
-LBB1_2: # bb7
- movl %edx, %eax
- subl %ecx, %eax
+.LBB0_2: # %bb7
+ movl 12(%esp), %edx
+ movl %ecx, %eax
+ subl %edx, %eax
ret
-The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2
-if it commuted the addl in LBB1_1.
+There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
+couple more movls by putting 4(%esp) into %eax instead of %ecx.
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-Legalize loses track of the fact that bools are always zero extended when in
-memory. This causes us to compile abort_gzip (from 164.gzip) from:
+Take the following:
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
}
declare void @exit(i32) noreturn nounwind
-into:
-
-_abort_gzip:
+This compiles into:
+_abort_gzip: ## @abort_gzip
+## BB#0: ## %entry
subl $12, %esp
movb _in_exit.4870.b, %al
- notb %al
- testb $1, %al
- jne LBB1_2 ## bb4.i
-LBB1_1: ## bb.i
- ...
+ cmpb $1, %al
+ jne LBB0_2
+
+We somehow miss folding the movb into the cmpb.
//===---------------------------------------------------------------------===//
shows up with bitfields.
//===---------------------------------------------------------------------===//
+
+Take the following C code:
+int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
+
+We generate the following IR with clang:
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %tmp = xor i32 %b, %a ; <i32> [#uses=1]
+ %tmp6 = and i32 %tmp, 255 ; <i32> [#uses=1]
+ %cmp = icmp eq i32 %tmp6, 0 ; <i1> [#uses=1]
+ %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
+ ret i32 %conv5
+}
+
+And the following x86 code:
+ xorl %esi, %edi
+ testb $-1, %dil
+ sete %al
+ movzbl %al, %eax
+ ret
+
+A cmpb instead of the xorl+testb would be one instruction shorter.
+
+//===---------------------------------------------------------------------===//
+
+Given the following C code:
+int f(int a, int b) { return (signed char)a == (signed char)b; }
+
+We generate the following IR with clang:
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %sext = shl i32 %a, 24 ; <i32> [#uses=1]
+ %conv1 = ashr i32 %sext, 24 ; <i32> [#uses=1]
+ %sext6 = shl i32 %b, 24 ; <i32> [#uses=1]
+ %conv4 = ashr i32 %sext6, 24 ; <i32> [#uses=1]
+ %cmp = icmp eq i32 %conv1, %conv4 ; <i1> [#uses=1]
+ %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
+ ret i32 %conv5
+}
+
+And the following x86 code:
+ movsbl %sil, %eax
+ movsbl %dil, %ecx
+ cmpl %eax, %ecx
+ sete %al
+ movzbl %al, %eax
+ ret
+
+
+It should be possible to eliminate the sign extensions.
+
+//===---------------------------------------------------------------------===//
+
+LLVM misses a load+store narrowing opportunity in this code:
+
+%struct.bf = type { i64, i16, i16, i32 }
+
+@bfi = external global %struct.bf* ; <%struct.bf**> [#uses=2]
+
+define void @t1() nounwind ssp {
+entry:
+ %0 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
+ %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
+ %2 = bitcast i16* %1 to i32* ; <i32*> [#uses=2]
+ %3 = load i32* %2, align 1 ; <i32> [#uses=1]
+ %4 = and i32 %3, -65537 ; <i32> [#uses=1]
+ store i32 %4, i32* %2, align 1
+ %5 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
+ %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
+ %7 = bitcast i16* %6 to i32* ; <i32*> [#uses=2]
+ %8 = load i32* %7, align 1 ; <i32> [#uses=1]
+ %9 = and i32 %8, -131073 ; <i32> [#uses=1]
+ store i32 %9, i32* %7, align 1
+ ret void
+}
+
+LLVM currently emits this:
+
+ movq bfi(%rip), %rax
+ andl $-65537, 8(%rax)
+ movq bfi(%rip), %rax
+ andl $-131073, 8(%rax)
+ ret
+
+It could narrow the loads and stores to emit this:
+
+ movq bfi(%rip), %rax
+ andb $-2, 10(%rax)
+ movq bfi(%rip), %rax
+ andb $-3, 10(%rax)
+ ret
+
+The trouble is that there is a TokenFactor between the store and the
+load, making it non-trivial to determine if there's anything between
+the load and the store which would prohibit narrowing.
+
+//===---------------------------------------------------------------------===//