//===---------------------------------------------------------------------===//
-If shorter, we should use things like:
-movzwl %ax, %eax
-instead of:
-andl $65535, %EAX
-
-The former can also be used when the two-addressy nature of the 'and' would
-require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
-
-//===---------------------------------------------------------------------===//
-
-Another instruction selector deficiency:
-
-void %bar() {
- %tmp = load int (int)** %foo
- %tmp = tail call int %tmp( int 3 )
- ret void
-}
-
-_bar:
- subl $12, %esp
- movl L_foo$non_lazy_ptr, %eax
- movl (%eax), %eax
- call *%eax
- addl $12, %esp
- ret
-
-The current isel scheme will not allow the load to be folded in the call since
-the load's chain result is read by the callseq_start.
-
-//===---------------------------------------------------------------------===//
-
For this:
int test(int a)
However, if we care more about code size, then imull is better. It's two bytes
shorter than movl + leal.
+On a Pentium M, both variants have the same characteristics with regard
+to throughput; however, the multiplication has a latency of four cycles, as
+opposed to two cycles for the movl+lea variant.
+
//===---------------------------------------------------------------------===//
__builtin_ffs codegen is messy.
to grab the bytes from the next cacheline.
532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines
-942 942 0x3d03 movl %dh, (1809(%esp, %esi)
-937 937 0x3d0a incl %esi
-3 3 0x3d0b cmpb %bl, %dl
+942 942 0x3d03 movl %dh, (1809(%esp, %esi)
+937 937 0x3d0a incl %esi
+3 3 0x3d0b cmpb %bl, %dl
27 27 0x3d0d jnz 0x000062db <main+11707>
//===---------------------------------------------------------------------===//
ret
it would be better to codegen as: x+~y (notl+addl)
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+int foo(const char *str,...)
+{
+ __builtin_va_list a; int x;
+ __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
+ return x;
+}
+
+gets compiled into this on x86-64:
+ subq $200, %rsp
+ movaps %xmm7, 160(%rsp)
+ movaps %xmm6, 144(%rsp)
+ movaps %xmm5, 128(%rsp)
+ movaps %xmm4, 112(%rsp)
+ movaps %xmm3, 96(%rsp)
+ movaps %xmm2, 80(%rsp)
+ movaps %xmm1, 64(%rsp)
+ movaps %xmm0, 48(%rsp)
+ movq %r9, 40(%rsp)
+ movq %r8, 32(%rsp)
+ movq %rcx, 24(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 8(%rsp)
+ leaq (%rsp), %rax
+ movq %rax, 192(%rsp)
+ leaq 208(%rsp), %rax
+ movq %rax, 184(%rsp)
+ movl $48, 180(%rsp)
+ movl $8, 176(%rsp)
+ movl 176(%rsp), %eax
+ cmpl $47, %eax
+ jbe .LBB1_3 # bb
+.LBB1_1: # bb3
+ movq 184(%rsp), %rcx
+ leaq 8(%rcx), %rax
+ movq %rax, 184(%rsp)
+.LBB1_2: # bb4
+ movl (%rcx), %eax
+ addq $200, %rsp
+ ret
+.LBB1_3: # bb
+ movl %eax, %ecx
+ addl $8, %eax
+ addq 192(%rsp), %rcx
+ movl %eax, 176(%rsp)
+ jmp .LBB1_2 # bb4
+
+gcc 4.3 generates:
+ subq $96, %rsp
+.LCFI0:
+ leaq 104(%rsp), %rax
+ movq %rsi, -80(%rsp)
+ movl $8, -120(%rsp)
+ movq %rax, -112(%rsp)
+ leaq -88(%rsp), %rax
+ movq %rax, -104(%rsp)
+ movl $8, %eax
+ cmpl $48, %eax
+ jb .L6
+ movq -112(%rsp), %rdx
+ movl (%rdx), %eax
+ addq $96, %rsp
+ ret
+ .p2align 4,,10
+ .p2align 3
+.L6:
+ mov %eax, %edx
+ addq -104(%rsp), %rdx
+ addl $8, %eax
+ movl %eax, -120(%rsp)
+ movl (%rdx), %eax
+ addq $96, %rsp
+ ret
+
+and it gets compiled into this on x86:
+ pushl %ebp
+ movl %esp, %ebp
+ subl $4, %esp
+ leal 12(%ebp), %eax
+ movl %eax, -4(%ebp)
+ leal 16(%ebp), %eax
+ movl %eax, -4(%ebp)
+ movl 12(%ebp), %eax
+ addl $4, %esp
+ popl %ebp
+ ret
+
+gcc 4.3 generates:
+ pushl %ebp
+ movl %esp, %ebp
+ movl 12(%ebp), %eax
+ popl %ebp
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Teach tblgen not to check bitconvert source type in some cases. This allows us
+to consolidate the following patterns in X86InstrMMX.td:
+
+def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
+
+There are other cases in various td files.
+
+//===---------------------------------------------------------------------===//
+
+Take something like the following on x86-32:
+unsigned a(unsigned long long x, unsigned y) {return x % y;}
+
+We currently generate a libcall, but we really shouldn't: the expansion is
+shorter and likely faster than the libcall. The expected code is something
+like the following:
+
+ movl 12(%ebp), %eax
+ movl 16(%ebp), %ecx
+ xorl %edx, %edx
+ divl %ecx
+ movl 8(%ebp), %eax
+ divl %ecx
+ movl %edx, %eax
+ ret
+
+A similar code sequence works for division.
+
+//===---------------------------------------------------------------------===//
+
+These should compile to the same code, but the later codegen's to useless
+instructions on X86. This may be a trivial dag combine (GCC PR7061):
+
+struct s1 { unsigned char a, b; };
+unsigned long f1(struct s1 x) {
+ return x.a + x.b;
+}
+struct s2 { unsigned a: 8, b: 8; };
+unsigned long f2(struct s2 x) {
+ return x.a + x.b;
+}
+
+//===---------------------------------------------------------------------===//
+
+We currently compile this:
+
+define i32 @func1(i32 %v1, i32 %v2) nounwind {
+entry:
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %sum = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %normal
+normal:
+ ret i32 %sum
+overflow:
+ call void @llvm.trap()
+ unreachable
+}
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
+declare void @llvm.trap()
+
+to:
+
+_func1:
+ movl 4(%esp), %eax
+ addl 8(%esp), %eax
+ jo LBB1_2 ## overflow
+LBB1_1: ## normal
+ ret
+LBB1_2: ## overflow
+ ud2
+
+it would be nice to produce "into" someday.
+
+//===---------------------------------------------------------------------===//