// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//
-Missing features:
- - Support for SSE4: http://www.intel.com/software/penryn
-http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
- - support for 3DNow!
- - weird abis?
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-mov $reg, 48(%esp)
-...
-leal 48(%esp), %eax
-mov %eax, (%esp)
-call _foo
-
-Obviously it would have been better for the first mov (or any op) to store
-directly %esp[0] if there are no other uses.
-
-//===---------------------------------------------------------------------===//
-
Adding to the list of cmp / test poor codegen issues:
int test(__m128 *A, __m128 *B) {
//===---------------------------------------------------------------------===//
-This is a "commutable two-address" register coallescing deficiency:
-
-define <4 x float> @test1(<4 x float> %V) {
-entry:
- %tmp8 = shufflevector <4 x float> %V, <4 x float> undef,
- <4 x i32> < i32 3, i32 2, i32 1, i32 0 >
- %add = add <4 x float> %tmp8, %V
- ret <4 x float> %add
-}
-
-this codegens to:
-
-_test1:
- pshufd $27, %xmm0, %xmm1
- addps %xmm0, %xmm1
- movaps %xmm1, %xmm0
- ret
-
-instead of:
-
-_test1:
- pshufd $27, %xmm0, %xmm1
- addps %xmm1, %xmm0
- ret
-
-//===---------------------------------------------------------------------===//
-
Leaf functions that require one 4-byte spill slot have a prolog like this:
_foo:
//===---------------------------------------------------------------------===//
-For this code:
-
-cond_next603: ; preds = %bb493, %cond_true336, %cond_next599
- %v.21050.1 = phi i32 [ %v.21050.0, %cond_next599 ], [ %tmp344, %cond_true336 ], [ %v.2, %bb493 ] ; <i32> [#uses=1]
- %maxz.21051.1 = phi i32 [ %maxz.21051.0, %cond_next599 ], [ 0, %cond_true336 ], [ %maxz.2, %bb493 ] ; <i32> [#uses=2]
- %cnt.01055.1 = phi i32 [ %cnt.01055.0, %cond_next599 ], [ 0, %cond_true336 ], [ %cnt.0, %bb493 ] ; <i32> [#uses=2]
- %byteptr.9 = phi i8* [ %byteptr.12, %cond_next599 ], [ %byteptr.0, %cond_true336 ], [ %byteptr.10, %bb493 ] ; <i8*> [#uses=9]
- %bitptr.6 = phi i32 [ %tmp5571104.1, %cond_next599 ], [ %tmp4921049, %cond_true336 ], [ %bitptr.7, %bb493 ] ; <i32> [#uses=4]
- %source.5 = phi i32 [ %tmp602, %cond_next599 ], [ %source.0, %cond_true336 ], [ %source.6, %bb493 ] ; <i32> [#uses=7]
- %tmp606 = getelementptr %struct.const_tables* @tables, i32 0, i32 0, i32 %cnt.01055.1 ; <i8*> [#uses=1]
- %tmp607 = load i8* %tmp606, align 1 ; <i8> [#uses=1]
-
-We produce this:
-
-LBB4_70: # cond_next603
- movl -20(%ebp), %esi
- movl L_tables$non_lazy_ptr-"L4$pb"(%esi), %esi
-
-However, ICC caches this information before the loop and produces this:
-
- movl 88(%esp), %eax #481.12
-
-//===---------------------------------------------------------------------===//
-
This code:
%tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1]
//===---------------------------------------------------------------------===//
-Take the following code:
+We compile this function:
-#include <xmmintrin.h>
-__m128i doload64(short x) {return _mm_set_epi16(x,x,x,x,x,x,x,x);}
+define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind {
+entry:
+ %tmp2 = icmp eq i8 %d, 0 ; <i1> [#uses=1]
+ br i1 %tmp2, label %bb7, label %bb
-LLVM currently generates the following on x86:
-doload64:
- movzwl 4(%esp), %eax
- movd %eax, %xmm0
- punpcklwd %xmm0, %xmm0
- pshufd $0, %xmm0, %xmm0
- ret
+bb: ; preds = %entry
+ %tmp6 = add i32 %b, %a ; <i32> [#uses=1]
+ ret i32 %tmp6
-gcc's generated code:
-doload64:
- movd 4(%esp), %xmm0
- punpcklwd %xmm0, %xmm0
- pshufd $0, %xmm0, %xmm0
- ret
+bb7: ; preds = %entry
+ %tmp10 = sub i32 %a, %c ; <i32> [#uses=1]
+ ret i32 %tmp10
+}
-LLVM should be able to generate the same thing as gcc.
+to:
+
+_foo:
+ cmpb $0, 16(%esp)
+ movl 12(%esp), %ecx
+ movl 8(%esp), %eax
+ movl 4(%esp), %edx
+ je LBB1_2 # bb7
+LBB1_1: # bb
+ addl %edx, %eax
+ ret
+LBB1_2: # bb7
+ movl %edx, %eax
+ subl %ecx, %eax
+ ret
+
+The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2
+if it commuted the addl in LBB1_1.
+
+//===---------------------------------------------------------------------===//
+
+See rdar://4653682.
+
+From flops:
+
+LBB1_15: # bb310
+ cvtss2sd LCPI1_0, %xmm1
+ addsd %xmm1, %xmm0
+ movsd 176(%esp), %xmm2
+ mulsd %xmm0, %xmm2
+ movapd %xmm2, %xmm3
+ mulsd %xmm3, %xmm3
+ movapd %xmm3, %xmm4
+ mulsd LCPI1_23, %xmm4
+ addsd LCPI1_24, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_25, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_26, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_27, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_28, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd %xmm1, %xmm4
+ mulsd %xmm2, %xmm4
+ movsd 152(%esp), %xmm1
+ addsd %xmm4, %xmm1
+ movsd %xmm1, 152(%esp)
+ incl %eax
+ cmpl %eax, %esi
+ jge LBB1_15 # bb310
+LBB1_16: # bb358.loopexit
+ movsd 152(%esp), %xmm0
+ addsd %xmm0, %xmm0
+ addsd LCPI1_22, %xmm0
+ movsd %xmm0, 152(%esp)
+
+Rather than spilling the result of the last addsd in the loop, we should have
+insert a copy to split the interval (one for the duration of the loop, one
+extending to the fall through). The register pressure in the loop isn't high
+enough to warrant the spill.
+
+Also check why xmm7 is not used at all in the function.
//===---------------------------------------------------------------------===//
-Take the following code:
-#include <xmmintrin.h>
-__m128i doload64(short x) {return _mm_set_epi16(0,0,0,0,0,0,0,1);}
+Legalize loses track of the fact that bools are always zero extended when in
+memory. This causes us to compile abort_gzip (from 164.gzip) from:
-On x86, LLVM generates the following:
-doload64:
- subl $28, %esp
- movl $0, 4(%esp)
- movl $1, (%esp)
- movq (%esp), %xmm0
- addl $28, %esp
- ret
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin8"
+@in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2]
+define fastcc void @abort_gzip() noreturn nounwind {
+entry:
+ %tmp.b.i = load i1* @in_exit.4870.b ; <i1> [#uses=1]
+ br i1 %tmp.b.i, label %bb.i, label %bb4.i
+bb.i: ; preds = %entry
+ tail call void @exit( i32 1 ) noreturn nounwind
+ unreachable
+bb4.i: ; preds = %entry
+ store i1 true, i1* @in_exit.4870.b
+ tail call void @exit( i32 1 ) noreturn nounwind
+ unreachable
+}
+declare void @exit(i32) noreturn nounwind
-LLVM should instead generate something more like the following:
-doload64:
- movl $1, %eax
- movd %eax, %xmm0
- ret
+into:
+
+_abort_gzip:
+ subl $12, %esp
+ movb _in_exit.4870.b, %al
+ notb %al
+ testb $1, %al
+ jne LBB1_2 ## bb4.i
+LBB1_1: ## bb.i
+ ...
+
+//===---------------------------------------------------------------------===//
+
+We compile:
+
+int test(int x, int y) {
+ return x-y-1;
+}
+
+into (-m64):
+
+_test:
+ decl %edi
+ movl %edi, %eax
+ subl %esi, %eax
+ ret
+
+it would be better to codegen as: x+~y (notl+addl)
+
+//===---------------------------------------------------------------------===//
+
+We should consider using __i686.get_pc_thunk.bx for MOVPC32r (used for PIC)
+on targets that support it, such as Linux and similar targets, in place of
+the call-a-label trick. It's said to be friendlier to branch-prediction
+hardware because it pairs a ret with the call.
//===---------------------------------------------------------------------===//