// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//
+We should add support for the "movbe" instruction, which does a byte-swapping
+copy (3-addr bswap + memory support?) This is available on Atom processors.
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-%X = weak global int 0
-
-void %foo(int %N) {
- %N = cast int %N to uint
- %tmp.24 = setgt int %N, 0
- br bool %tmp.24, label %no_exit, label %return
-
-no_exit:
- %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
- %i.0.0 = cast uint %indvar to int
- volatile store int %i.0.0, int* %X
- %indvar.next = add uint %indvar, 1
- %exitcond = seteq uint %indvar.next, %N
- br bool %exitcond, label %return, label %no_exit
-
-return:
- ret void
-}
-
-compiles into:
-
- .text
- .align 4
- .globl _foo
-_foo:
- movl 4(%esp), %eax
- cmpl $1, %eax
- jl LBB_foo_4 # return
-LBB_foo_1: # no_exit.preheader
- xorl %ecx, %ecx
-LBB_foo_2: # no_exit
- movl L_X$non_lazy_ptr, %edx
- movl %ecx, (%edx)
- incl %ecx
- cmpl %eax, %ecx
- jne LBB_foo_2 # no_exit
-LBB_foo_3: # return.loopexit
-LBB_foo_4: # return
- ret
-
-We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
-remateralization is implemented. This can be accomplished with 1) a target
-dependent LICM pass or 2) makeing SelectDAG represent the whole function.
-
-//===---------------------------------------------------------------------===//
-
The following tests perform worse with LSR:
lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
//===---------------------------------------------------------------------===//
-We are generating far worse code than gcc:
-
-volatile short X, Y;
-
-void foo(int N) {
- int i;
- for (i = 0; i < N; i++) { X = i; Y = i*4; }
-}
-
-LBB1_1: # entry.bb_crit_edge
- xorl %ecx, %ecx
- xorw %dx, %dx
-LBB1_2: # bb
- movl L_X$non_lazy_ptr, %esi
- movw %cx, (%esi)
- movl L_Y$non_lazy_ptr, %esi
- movw %dx, (%esi)
- addw $4, %dx
- incl %ecx
- cmpl %eax, %ecx
- jne LBB1_2 # bb
-
-vs.
-
- xorl %edx, %edx
- movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
- movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
-L4:
- movw %dx, (%esi)
- leal 0(,%edx,4), %eax
- movw %ax, (%ecx)
- addl $1, %edx
- cmpl %edx, %edi
- jne L4
-
-This is due to the lack of post regalloc LICM.
-
-//===---------------------------------------------------------------------===//
-
Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
FR64 to VR128.
//===---------------------------------------------------------------------===//
-Currently we don't have elimination of redundant stack manipulations. Consider
-the code:
-
-int %main() {
-entry:
- call fastcc void %test1( )
- call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
- ret int 0
-}
-
-declare fastcc void %test1()
-
-declare fastcc void %test2(sbyte*)
-
-
-This currently compiles to:
-
- subl $16, %esp
- call _test5
- addl $12, %esp
- subl $16, %esp
- movl $_test5, (%esp)
- call _test6
- addl $12, %esp
-
-The add\sub pair is really unneeded here.
-
-//===---------------------------------------------------------------------===//
-
Consider the expansion of:
define i32 @test3(i32 %X) {
//===---------------------------------------------------------------------===//
-We aren't matching RMW instructions aggressively
-enough. Here's a reduced testcase (more in PR1160):
-
-define void @test(i32* %huge_ptr, i32* %target_ptr) {
- %A = load i32* %huge_ptr ; <i32> [#uses=1]
- %B = load i32* %target_ptr ; <i32> [#uses=1]
- %C = or i32 %A, %B ; <i32> [#uses=1]
- store i32 %C, i32* %target_ptr
- ret void
-}
-
-$ llvm-as < t.ll | llc -march=x86-64
-
-_test:
- movl (%rdi), %eax
- orl (%rsi), %eax
- movl %eax, (%rsi)
- ret
-
-That should be something like:
-
-_test:
- movl (%rdi), %eax
- orl %eax, (%rsi)
- ret
-
-//===---------------------------------------------------------------------===//
-
The following code:
bb114.preheader: ; preds = %cond_next94
A similar code sequence works for division.
//===---------------------------------------------------------------------===//
+
+These should compile to the same code, but the later codegen's to useless
+instructions on X86. This may be a trivial dag combine (GCC PR7061):
+
+struct s1 { unsigned char a, b; };
+unsigned long f1(struct s1 x) {
+ return x.a + x.b;
+}
+struct s2 { unsigned a: 8, b: 8; };
+unsigned long f2(struct s2 x) {
+ return x.a + x.b;
+}
+
+//===---------------------------------------------------------------------===//
+
+We currently compile this:
+
+define i32 @func1(i32 %v1, i32 %v2) nounwind {
+entry:
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %sum = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %normal
+normal:
+ ret i32 %sum
+overflow:
+ call void @llvm.trap()
+ unreachable
+}
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
+declare void @llvm.trap()
+
+to:
+
+_func1:
+ movl 4(%esp), %eax
+ addl 8(%esp), %eax
+ jo LBB1_2 ## overflow
+LBB1_1: ## normal
+ ret
+LBB1_2: ## overflow
+ ud2
+
+it would be nice to produce "into" someday.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+void vec_mpys1(int y[], const int x[], int scaler) {
+int i;
+for (i = 0; i < 150; i++)
+ y[i] += (((long long)scaler * (long long)x[i]) >> 31);
+}
+
+Compiles to this loop with GCC 3.x:
+
+.L5:
+ movl %ebx, %eax
+ imull (%edi,%ecx,4)
+ shrdl $31, %edx, %eax
+ addl %eax, (%esi,%ecx,4)
+ incl %ecx
+ cmpl $149, %ecx
+ jle .L5
+
+llvm-gcc compiles it to the much uglier:
+
+LBB1_1: ## bb1
+ movl 24(%esp), %eax
+ movl (%eax,%edi,4), %ebx
+ movl %ebx, %ebp
+ imull %esi, %ebp
+ movl %ebx, %eax
+ mull %ecx
+ addl %ebp, %edx
+ sarl $31, %ebx
+ imull %ecx, %ebx
+ addl %edx, %ebx
+ shldl $1, %eax, %ebx
+ movl 20(%esp), %eax
+ addl %ebx, (%eax,%edi,4)
+ incl %edi
+ cmpl $150, %edi
+ jne LBB1_1 ## bb1
+
+//===---------------------------------------------------------------------===//
+
+Test instructions can be eliminated by using EFLAGS values from arithmetic
+instructions. This is currently not done for mul, and, or, xor, neg, shl,
+sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
+for read-modify-write instructions. It is also current not done if the
+OF or CF flags are needed.
+
+The shift operators have the complication that when the shift count is
+zero, EFLAGS is not set, so they can only subsume a test instruction if
+the shift count is known to be non-zero. Also, using the EFLAGS value
+from a shift is apparently very slow on some x86 implementations.
+
+In read-modify-write instructions, the root node in the isel match is
+the store, and isel has no way for the use of the EFLAGS result of the
+arithmetic to be remapped to the new node.
+
+Add and subtract instructions set OF on signed overflow and CF on unsiged
+overflow, while test instructions always clear OF and CF. In order to
+replace a test with an add or subtract in a situation where OF or CF is
+needed, codegen must be able to prove that the operation cannot see
+signed or unsigned overflow, respectively.
+
+//===---------------------------------------------------------------------===//
+
+memcpy/memmove do not lower to SSE copies when possible. A silly example is:
+define <16 x float> @foo(<16 x float> %A) nounwind {
+ %tmp = alloca <16 x float>, align 16
+ %tmp2 = alloca <16 x float>, align 16
+ store <16 x float> %A, <16 x float>* %tmp
+ %s = bitcast <16 x float>* %tmp to i8*
+ %s2 = bitcast <16 x float>* %tmp2 to i8*
+ call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
+ %R = load <16 x float>* %tmp2
+ ret <16 x float> %R
+}
+
+declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
+
+which compiles to:
+
+_foo:
+ subl $140, %esp
+ movaps %xmm3, 112(%esp)
+ movaps %xmm2, 96(%esp)
+ movaps %xmm1, 80(%esp)
+ movaps %xmm0, 64(%esp)
+ movl 60(%esp), %eax
+ movl %eax, 124(%esp)
+ movl 56(%esp), %eax
+ movl %eax, 120(%esp)
+ movl 52(%esp), %eax
+ <many many more 32-bit copies>
+ movaps (%esp), %xmm0
+ movaps 16(%esp), %xmm1
+ movaps 32(%esp), %xmm2
+ movaps 48(%esp), %xmm3
+ addl $140, %esp
+ ret
+
+On Nehalem, it may even be cheaper to just use movups when unaligned than to
+fall back to lower-granularity chunks.
+
+//===---------------------------------------------------------------------===//
+
+Implement processor-specific optimizations for parity with GCC on these
+processors. GCC does two optimizations:
+
+1. ix86_pad_returns inserts a noop before ret instructions if immediately
+ preceeded by a conditional branch or is the target of a jump.
+2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
+ code contains more than 3 branches.
+
+The first one is done for all AMDs, Core2, and "Generic"
+The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
+ Core 2, and "Generic"
+
+//===---------------------------------------------------------------------===//
+
+Testcase:
+int a(int x) { return (x & 127) > 31; }
+
+Current output:
+ movl 4(%esp), %eax
+ andl $127, %eax
+ cmpl $31, %eax
+ seta %al
+ movzbl %al, %eax
+ ret
+
+Ideal output:
+ xorl %eax, %eax
+ testl $96, 4(%esp)
+ setne %al
+ ret
+
+This should definitely be done in instcombine, canonicalizing the range
+condition into a != condition. We get this IR:
+
+define i32 @a(i32 %x) nounwind readnone {
+entry:
+ %0 = and i32 %x, 127 ; <i32> [#uses=1]
+ %1 = icmp ugt i32 %0, 31 ; <i1> [#uses=1]
+ %2 = zext i1 %1 to i32 ; <i32> [#uses=1]
+ ret i32 %2
+}
+
+Instcombine prefers to strength reduce relational comparisons to equality
+comparisons when possible, this should be another case of that. This could
+be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
+looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already
+be redesigned to use ComputeMaskedBits and friends.
+
+
+//===---------------------------------------------------------------------===//
+Testcase:
+int x(int a) { return (a&0xf0)>>4; }
+
+Current output:
+ movl 4(%esp), %eax
+ shrl $4, %eax
+ andl $15, %eax
+ ret
+
+Ideal output:
+ movzbl 4(%esp), %eax
+ shrl $4, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Testcase:
+int x(int a) { return (a & 0x80) ? 0x100 : 0; }
+int y(int a) { return (a & 0x80) *2; }
+
+Current:
+ testl $128, 4(%esp)
+ setne %al
+ movzbl %al, %eax
+ shll $8, %eax
+ ret
+
+Better:
+ movl 4(%esp), %eax
+ addl %eax, %eax
+ andl $256, %eax
+ ret
+
+This is another general instcombine transformation that is profitable on all
+targets. In LLVM IR, these functions look like this:
+
+define i32 @x(i32 %a) nounwind readnone {
+entry:
+ %0 = and i32 %a, 128
+ %1 = icmp eq i32 %0, 0
+ %iftmp.0.0 = select i1 %1, i32 0, i32 256
+ ret i32 %iftmp.0.0
+}
+
+define i32 @y(i32 %a) nounwind readnone {
+entry:
+ %0 = shl i32 %a, 1
+ %1 = and i32 %0, 256
+ ret i32 %1
+}
+
+Replacing an icmp+select with a shift should always be considered profitable in
+instcombine.
+
+//===---------------------------------------------------------------------===//