+define i32 @a(i32 %x) nounwind readnone {
+entry:
+ %0 = and i32 %x, 127 ; <i32> [#uses=1]
+ %1 = icmp ugt i32 %0, 31 ; <i1> [#uses=1]
+ %2 = zext i1 %1 to i32 ; <i32> [#uses=1]
+ ret i32 %2
+}
+
+Instcombine prefers to strength reduce relational comparisons to equality
+comparisons when possible, this should be another case of that. This could
+be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
+looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already
+be redesigned to use ComputeMaskedBits and friends.
+
+
+//===---------------------------------------------------------------------===//
+Testcase:
+int x(int a) { return (a&0xf0)>>4; }
+
+Current output:
+ movl 4(%esp), %eax
+ shrl $4, %eax
+ andl $15, %eax
+ ret
+
+Ideal output:
+ movzbl 4(%esp), %eax
+ shrl $4, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Testcase:
+int x(int a) { return (a & 0x80) ? 0x100 : 0; }
+int y(int a) { return (a & 0x80) *2; }
+
+Current:
+ testl $128, 4(%esp)
+ setne %al
+ movzbl %al, %eax
+ shll $8, %eax
+ ret
+
+Better:
+ movl 4(%esp), %eax
+ addl %eax, %eax
+ andl $256, %eax
+ ret
+
+This is another general instcombine transformation that is profitable on all
+targets. In LLVM IR, these functions look like this:
+
+define i32 @x(i32 %a) nounwind readnone {
+entry:
+ %0 = and i32 %a, 128
+ %1 = icmp eq i32 %0, 0
+ %iftmp.0.0 = select i1 %1, i32 0, i32 256
+ ret i32 %iftmp.0.0
+}
+
+define i32 @y(i32 %a) nounwind readnone {
+entry:
+ %0 = shl i32 %a, 1
+ %1 = and i32 %0, 256
+ ret i32 %1
+}
+
+Replacing an icmp+select with a shift should always be considered profitable in
+instcombine.
+
+//===---------------------------------------------------------------------===//
+
+Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
+properly.
+
+When the return value is not used (i.e. only care about the value in the
+memory), x86 does not have to use add to implement these. Instead, it can use
+add, sub, inc, dec instructions with the "lock" prefix.
+
+This is currently implemented using a bit of instruction selection trick. The
+issue is the target independent pattern produces one output and a chain and we
+want to map it into one that just output a chain. The current trick is to select
+it into a MERGE_VALUES with the first definition being an implicit_def. The
+proper solution is to add new ISD opcodes for the no-output variant. DAG
+combiner can then transform the node before it gets to target node selection.
+
+Problem #2 is we are adding a whole bunch of x86 atomic instructions when in
+fact these instructions are identical to the non-lock versions. We need a way to
+add target specific information to target nodes and have this information
+carried over to machine instructions. Asm printer (or JIT) can use this
+information to add the "lock" prefix.
+
+//===---------------------------------------------------------------------===//
+
+_Bool bar(int *x) { return *x & 1; }
+
+define zeroext i1 @bar(i32* nocapture %x) nounwind readonly {
+entry:
+ %tmp1 = load i32* %x ; <i32> [#uses=1]
+ %and = and i32 %tmp1, 1 ; <i32> [#uses=1]
+ %tobool = icmp ne i32 %and, 0 ; <i1> [#uses=1]
+ ret i1 %tobool
+}
+
+bar: # @bar
+# BB#0: # %entry
+ movl 4(%esp), %eax
+ movb (%eax), %al
+ andb $1, %al
+ movzbl %al, %eax
+ ret
+
+Missed optimization: should be movl+andl.
+
+//===---------------------------------------------------------------------===//
+
+Consider the following two functions compiled with clang:
+_Bool foo(int *x) { return !(*x & 4); }
+unsigned bar(int *x) { return !(*x & 4); }
+
+foo:
+ movl 4(%esp), %eax
+ testb $4, (%eax)
+ sete %al
+ movzbl %al, %eax
+ ret
+
+bar:
+ movl 4(%esp), %eax
+ movl (%eax), %eax
+ shrl $2, %eax
+ andl $1, %eax
+ xorl $1, %eax
+ ret
+
+The second function generates more code even though the two functions are
+are functionally identical.
+
+//===---------------------------------------------------------------------===//
+
+Take the following C code:
+int x(int y) { return (y & 63) << 14; }
+
+Code produced by gcc:
+ andl $63, %edi
+ sall $14, %edi
+ movl %edi, %eax
+ ret
+
+Code produced by clang:
+ shll $14, %edi
+ movl %edi, %eax
+ andl $1032192, %eax
+ ret
+
+The code produced by gcc is 3 bytes shorter. This sort of construct often
+shows up with bitfields.
+
+//===---------------------------------------------------------------------===//
+
+Take the following C code:
+int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
+
+We generate the following IR with clang:
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %tmp = xor i32 %b, %a ; <i32> [#uses=1]
+ %tmp6 = and i32 %tmp, 255 ; <i32> [#uses=1]
+ %cmp = icmp eq i32 %tmp6, 0 ; <i1> [#uses=1]
+ %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
+ ret i32 %conv5
+}
+
+And the following x86 code:
+ xorl %esi, %edi
+ testb $-1, %dil
+ sete %al
+ movzbl %al, %eax
+ ret
+
+A cmpb instead of the xorl+testb would be one instruction shorter.
+
+//===---------------------------------------------------------------------===//
+
+Given the following C code:
+int f(int a, int b) { return (signed char)a == (signed char)b; }
+
+We generate the following IR with clang:
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %sext = shl i32 %a, 24 ; <i32> [#uses=1]
+ %conv1 = ashr i32 %sext, 24 ; <i32> [#uses=1]
+ %sext6 = shl i32 %b, 24 ; <i32> [#uses=1]
+ %conv4 = ashr i32 %sext6, 24 ; <i32> [#uses=1]
+ %cmp = icmp eq i32 %conv1, %conv4 ; <i1> [#uses=1]
+ %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
+ ret i32 %conv5
+}
+
+And the following x86 code:
+ movsbl %sil, %eax
+ movsbl %dil, %ecx
+ cmpl %eax, %ecx
+ sete %al
+ movzbl %al, %eax