+
+This code:
+
+void vec_mpys1(int y[], const int x[], int scaler) {
+int i;
+for (i = 0; i < 150; i++)
+ y[i] += (((long long)scaler * (long long)x[i]) >> 31);
+}
+
+Compiles to this loop with GCC 3.x:
+
+.L5:
+ movl %ebx, %eax
+ imull (%edi,%ecx,4)
+ shrdl $31, %edx, %eax
+ addl %eax, (%esi,%ecx,4)
+ incl %ecx
+ cmpl $149, %ecx
+ jle .L5
+
+llvm-gcc compiles it to the much uglier:
+
+LBB1_1: ## bb1
+ movl 24(%esp), %eax
+ movl (%eax,%edi,4), %ebx
+ movl %ebx, %ebp
+ imull %esi, %ebp
+ movl %ebx, %eax
+ mull %ecx
+ addl %ebp, %edx
+ sarl $31, %ebx
+ imull %ecx, %ebx
+ addl %edx, %ebx
+ shldl $1, %eax, %ebx
+ movl 20(%esp), %eax
+ addl %ebx, (%eax,%edi,4)
+ incl %edi
+ cmpl $150, %edi
+ jne LBB1_1 ## bb1
+
+The issue is that we hoist the cast of "scaler" to long long outside of the
+loop, the value comes into the loop as two values, and
+RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
+constructed BUILD_PAIR which represents the cast value.
+
+This can be handled by making CodeGenPrepare sink the cast.
+
+//===---------------------------------------------------------------------===//
+
+Test instructions can be eliminated by using EFLAGS values from arithmetic
+instructions. This is currently not done for mul, and, or, xor, neg, shl,
+sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
+for read-modify-write instructions. It is also current not done if the
+OF or CF flags are needed.
+
+The shift operators have the complication that when the shift count is
+zero, EFLAGS is not set, so they can only subsume a test instruction if
+the shift count is known to be non-zero. Also, using the EFLAGS value
+from a shift is apparently very slow on some x86 implementations.
+
+In read-modify-write instructions, the root node in the isel match is
+the store, and isel has no way for the use of the EFLAGS result of the
+arithmetic to be remapped to the new node.
+
+Add and subtract instructions set OF on signed overflow and CF on unsiged
+overflow, while test instructions always clear OF and CF. In order to
+replace a test with an add or subtract in a situation where OF or CF is
+needed, codegen must be able to prove that the operation cannot see
+signed or unsigned overflow, respectively.
+
+//===---------------------------------------------------------------------===//
+
+memcpy/memmove do not lower to SSE copies when possible. A silly example is:
+define <16 x float> @foo(<16 x float> %A) nounwind {
+ %tmp = alloca <16 x float>, align 16
+ %tmp2 = alloca <16 x float>, align 16
+ store <16 x float> %A, <16 x float>* %tmp
+ %s = bitcast <16 x float>* %tmp to i8*
+ %s2 = bitcast <16 x float>* %tmp2 to i8*
+ call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
+ %R = load <16 x float>* %tmp2
+ ret <16 x float> %R
+}
+
+declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
+
+which compiles to:
+
+_foo:
+ subl $140, %esp
+ movaps %xmm3, 112(%esp)
+ movaps %xmm2, 96(%esp)
+ movaps %xmm1, 80(%esp)
+ movaps %xmm0, 64(%esp)
+ movl 60(%esp), %eax
+ movl %eax, 124(%esp)
+ movl 56(%esp), %eax
+ movl %eax, 120(%esp)
+ movl 52(%esp), %eax
+ <many many more 32-bit copies>
+ movaps (%esp), %xmm0
+ movaps 16(%esp), %xmm1
+ movaps 32(%esp), %xmm2
+ movaps 48(%esp), %xmm3
+ addl $140, %esp
+ ret
+
+On Nehalem, it may even be cheaper to just use movups when unaligned than to
+fall back to lower-granularity chunks.
+
+//===---------------------------------------------------------------------===//
+
+Implement processor-specific optimizations for parity with GCC on these
+processors. GCC does two optimizations:
+
+1. ix86_pad_returns inserts a noop before ret instructions if immediately
+ preceded by a conditional branch or is the target of a jump.
+2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
+ code contains more than 3 branches.
+
+The first one is done for all AMDs, Core2, and "Generic"
+The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
+ Core 2, and "Generic"
+
+//===---------------------------------------------------------------------===//
+Testcase:
+int x(int a) { return (a&0xf0)>>4; }
+
+Current output:
+ movl 4(%esp), %eax
+ shrl $4, %eax
+ andl $15, %eax
+ ret
+
+Ideal output:
+ movzbl 4(%esp), %eax
+ shrl $4, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
+properly.
+
+When the return value is not used (i.e. only care about the value in the
+memory), x86 does not have to use add to implement these. Instead, it can use
+add, sub, inc, dec instructions with the "lock" prefix.
+
+This is currently implemented using a bit of instruction selection trick. The
+issue is the target independent pattern produces one output and a chain and we
+want to map it into one that just output a chain. The current trick is to select
+it into a MERGE_VALUES with the first definition being an implicit_def. The
+proper solution is to add new ISD opcodes for the no-output variant. DAG
+combiner can then transform the node before it gets to target node selection.
+
+Problem #2 is we are adding a whole bunch of x86 atomic instructions when in
+fact these instructions are identical to the non-lock versions. We need a way to
+add target specific information to target nodes and have this information
+carried over to machine instructions. Asm printer (or JIT) can use this
+information to add the "lock" prefix.
+
+//===---------------------------------------------------------------------===//
+
+struct B {
+ unsigned char y0 : 1;
+};
+
+int bar(struct B* a) { return a->y0; }
+
+define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
+ %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
+ %2 = load i8* %1, align 1
+ %3 = and i8 %2, 1
+ %4 = zext i8 %3 to i32
+ ret i32 %4
+}
+
+bar: # @bar
+# BB#0:
+ movb (%rdi), %al
+ andb $1, %al
+ movzbl %al, %eax
+ ret
+
+Missed optimization: should be movl+andl.
+
+//===---------------------------------------------------------------------===//
+
+The x86_64 abi says:
+
+Booleans, when stored in a memory object, are stored as single byte objects the
+value of which is always 0 (false) or 1 (true).
+
+We are not using this fact:
+
+int bar(_Bool *a) { return *a; }
+
+define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
+ %1 = load i8* %a, align 1, !tbaa !0
+ %tmp = and i8 %1, 1
+ %2 = zext i8 %tmp to i32
+ ret i32 %2
+}
+
+bar:
+ movb (%rdi), %al
+ andb $1, %al
+ movzbl %al, %eax
+ ret
+
+GCC produces
+
+bar:
+ movzbl (%rdi), %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Consider the following two functions compiled with clang:
+_Bool foo(int *x) { return !(*x & 4); }
+unsigned bar(int *x) { return !(*x & 4); }
+
+foo:
+ movl 4(%esp), %eax
+ testb $4, (%eax)
+ sete %al
+ movzbl %al, %eax
+ ret
+
+bar:
+ movl 4(%esp), %eax
+ movl (%eax), %eax
+ shrl $2, %eax
+ andl $1, %eax
+ xorl $1, %eax
+ ret
+
+The second function generates more code even though the two functions are
+are functionally identical.
+
+//===---------------------------------------------------------------------===//
+
+Take the following C code:
+int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
+
+We generate the following IR with clang:
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %tmp = xor i32 %b, %a ; <i32> [#uses=1]
+ %tmp6 = and i32 %tmp, 255 ; <i32> [#uses=1]
+ %cmp = icmp eq i32 %tmp6, 0 ; <i1> [#uses=1]
+ %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
+ ret i32 %conv5
+}
+
+And the following x86 code:
+ xorl %esi, %edi
+ testb $-1, %dil
+ sete %al
+ movzbl %al, %eax
+ ret
+
+A cmpb instead of the xorl+testb would be one instruction shorter.
+
+//===---------------------------------------------------------------------===//
+
+Given the following C code:
+int f(int a, int b) { return (signed char)a == (signed char)b; }
+
+We generate the following IR with clang:
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %sext = shl i32 %a, 24 ; <i32> [#uses=1]
+ %conv1 = ashr i32 %sext, 24 ; <i32> [#uses=1]
+ %sext6 = shl i32 %b, 24 ; <i32> [#uses=1]
+ %conv4 = ashr i32 %sext6, 24 ; <i32> [#uses=1]
+ %cmp = icmp eq i32 %conv1, %conv4 ; <i1> [#uses=1]
+ %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
+ ret i32 %conv5
+}
+
+And the following x86 code:
+ movsbl %sil, %eax
+ movsbl %dil, %ecx
+ cmpl %eax, %ecx
+ sete %al
+ movzbl %al, %eax
+ ret
+
+
+It should be possible to eliminate the sign extensions.
+
+//===---------------------------------------------------------------------===//
+
+LLVM misses a load+store narrowing opportunity in this code:
+
+%struct.bf = type { i64, i16, i16, i32 }
+
+@bfi = external global %struct.bf* ; <%struct.bf**> [#uses=2]
+
+define void @t1() nounwind ssp {
+entry:
+ %0 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
+ %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
+ %2 = bitcast i16* %1 to i32* ; <i32*> [#uses=2]
+ %3 = load i32* %2, align 1 ; <i32> [#uses=1]
+ %4 = and i32 %3, -65537 ; <i32> [#uses=1]
+ store i32 %4, i32* %2, align 1
+ %5 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
+ %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
+ %7 = bitcast i16* %6 to i32* ; <i32*> [#uses=2]
+ %8 = load i32* %7, align 1 ; <i32> [#uses=1]
+ %9 = and i32 %8, -131073 ; <i32> [#uses=1]
+ store i32 %9, i32* %7, align 1
+ ret void
+}
+
+LLVM currently emits this:
+
+ movq bfi(%rip), %rax
+ andl $-65537, 8(%rax)
+ movq bfi(%rip), %rax
+ andl $-131073, 8(%rax)
+ ret
+
+It could narrow the loads and stores to emit this:
+
+ movq bfi(%rip), %rax
+ andb $-2, 10(%rax)
+ movq bfi(%rip), %rax
+ andb $-3, 10(%rax)
+ ret
+
+The trouble is that there is a TokenFactor between the store and the
+load, making it non-trivial to determine if there's anything between
+the load and the store which would prohibit narrowing.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+void foo(unsigned x) {
+ if (x == 0) bar();
+ else if (x == 1) qux();
+}
+
+currently compiles into:
+_foo:
+ movl 4(%esp), %eax
+ cmpl $1, %eax
+ je LBB0_3
+ testl %eax, %eax
+ jne LBB0_4
+
+the testl could be removed:
+_foo:
+ movl 4(%esp), %eax
+ cmpl $1, %eax
+ je LBB0_3
+ jb LBB0_4
+
+0 is the only unsigned number < 1.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+%0 = type { i32, i1 }
+
+define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
+entry:
+ %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
+ %cmp = extractvalue %0 %uadd, 1
+ %inc = zext i1 %cmp to i32
+ %add = add i32 %x, %sum
+ %z.0 = add i32 %add, %inc
+ ret i32 %z.0
+}
+
+declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
+
+compiles to:
+
+_add32carry: ## @add32carry
+ addl %esi, %edi
+ sbbl %ecx, %ecx
+ movl %edi, %eax
+ subl %ecx, %eax
+ ret
+
+But it could be:
+
+_add32carry:
+ leal (%rsi,%rdi), %eax
+ cmpl %esi, %eax
+ adcl $0, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+The hot loop of 256.bzip2 contains code that looks a bit like this:
+
+int foo(char *P, char *Q, int x, int y) {
+ if (P[0] != Q[0])
+ return P[0] < Q[0];
+ if (P[1] != Q[1])
+ return P[1] < Q[1];
+ if (P[2] != Q[2])
+ return P[2] < Q[2];
+ return P[3] < Q[3];
+}
+
+In the real code, we get a lot more wrong than this. However, even in this
+code we generate:
+
+_foo: ## @foo
+## BB#0: ## %entry
+ movb (%rsi), %al
+ movb (%rdi), %cl
+ cmpb %al, %cl
+ je LBB0_2
+LBB0_1: ## %if.then
+ cmpb %al, %cl
+ jmp LBB0_5
+LBB0_2: ## %if.end
+ movb 1(%rsi), %al
+ movb 1(%rdi), %cl
+ cmpb %al, %cl
+ jne LBB0_1
+## BB#3: ## %if.end38
+ movb 2(%rsi), %al
+ movb 2(%rdi), %cl
+ cmpb %al, %cl
+ jne LBB0_1
+## BB#4: ## %if.end60
+ movb 3(%rdi), %al
+ cmpb 3(%rsi), %al
+LBB0_5: ## %if.end60
+ setl %al
+ movzbl %al, %eax
+ ret
+
+Note that we generate jumps to LBB0_1 which does a redundant compare. The
+redundant compare also forces the register values to be live, which prevents
+folding one of the loads into the compare. In contrast, GCC 4.2 produces:
+
+_foo:
+ movzbl (%rsi), %eax
+ cmpb %al, (%rdi)
+ jne L10
+L12:
+ movzbl 1(%rsi), %eax
+ cmpb %al, 1(%rdi)
+ jne L10
+ movzbl 2(%rsi), %eax
+ cmpb %al, 2(%rdi)
+ jne L10
+ movzbl 3(%rdi), %eax
+ cmpb 3(%rsi), %al
+L10:
+ setl %al
+ movzbl %al, %eax
+ ret
+
+which is "perfect".
+
+//===---------------------------------------------------------------------===//
+
+For the branch in the following code:
+int a();
+int b(int x, int y) {
+ if (x & (1<<(y&7)))
+ return a();
+ return y;
+}
+
+We currently generate:
+ movb %sil, %al
+ andb $7, %al
+ movzbl %al, %eax
+ btl %eax, %edi
+ jae .LBB0_2
+
+movl+andl would be shorter than the movb+andb+movzbl sequence.
+
+//===---------------------------------------------------------------------===//
+
+For the following:
+struct u1 {
+ float x, y;
+};
+float foo(struct u1 u) {
+ return u.x + u.y;
+}
+
+We currently generate:
+ movdqa %xmm0, %xmm1
+ pshufd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0,0,0]
+ addss %xmm1, %xmm0
+ ret
+
+We could save an instruction here by commuting the addss.
+
+//===---------------------------------------------------------------------===//
+
+This (from PR9661):
+
+float clamp_float(float a) {
+ if (a > 1.0f)
+ return 1.0f;
+ else if (a < 0.0f)
+ return 0.0f;
+ else
+ return a;
+}
+
+Could compile to:
+
+clamp_float: # @clamp_float
+ movss .LCPI0_0(%rip), %xmm1
+ minss %xmm1, %xmm0
+ pxor %xmm1, %xmm1
+ maxss %xmm1, %xmm0
+ ret
+
+with -ffast-math.
+
+//===---------------------------------------------------------------------===//
+
+This function (from PR9803):
+
+int clamp2(int a) {
+ if (a > 5)
+ a = 5;
+ if (a < 0)
+ return 0;
+ return a;
+}
+
+Compiles to:
+
+_clamp2: ## @clamp2
+ pushq %rbp
+ movq %rsp, %rbp
+ cmpl $5, %edi
+ movl $5, %ecx
+ cmovlel %edi, %ecx
+ testl %ecx, %ecx
+ movl $0, %eax
+ cmovnsl %ecx, %eax
+ popq %rbp
+ ret
+
+The move of 0 could be scheduled above the test to make it is xor reg,reg.
+
+//===---------------------------------------------------------------------===//
+
+GCC PR48986. We currently compile this:
+
+void bar(void);
+void yyy(int* p) {
+ if (__sync_fetch_and_add(p, -1) == 1)
+ bar();
+}
+
+into:
+ movl $-1, %eax
+ lock
+ xaddl %eax, (%rdi)
+ cmpl $1, %eax
+ je LBB0_2
+
+Instead we could generate:
+
+ lock
+ dec %rdi
+ je LBB0_2
+
+The trick is to match "fetch_and_add(X, -C) == C".
+
+//===---------------------------------------------------------------------===//
+
+unsigned t(unsigned a, unsigned b) {
+ return a <= b ? 5 : -5;
+}
+
+We generate:
+ movl $5, %ecx
+ cmpl %esi, %edi
+ movl $-5, %eax
+ cmovbel %ecx, %eax
+
+GCC:
+ cmpl %edi, %esi
+ sbbl %eax, %eax
+ andl $-10, %eax
+ addl $5, %eax
+
+//===---------------------------------------------------------------------===//