// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//
We should add support for the "movbe" instruction, which does a byte-swapping
copy (3-addr bswap + memory support?). This is available on Atom processors.
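
A hedged illustration (my example, not from the original note): a byte-swapping
load that currently selects to a mov plus a bswap, but could be a single movbe
on Atom. __builtin_bswap32 is the GCC/Clang builtin.

#include <stdint.h>

/* Load a big-endian 32-bit value: the bswap-of-load pattern that a
   movbe-aware instruction selector could match as one instruction. */
uint32_t load_be32(const uint32_t *p) {
  return __builtin_bswap32(*p);
}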
//===---------------------------------------------------------------------===//
%X = weak global int 0

void %foo(int %N) {
	%N = cast int %N to uint
	%tmp.24 = setgt int %N, 0
	br bool %tmp.24, label %no_exit, label %return

no_exit:
	%indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
	%i.0.0 = cast uint %indvar to int
	volatile store int %i.0.0, int* %X
	%indvar.next = add uint %indvar, 1
	%exitcond = seteq uint %indvar.next, %N
	br bool %exitcond, label %return, label %no_exit

return:
	ret void
}

compiles into:

	.text
	.align 4
	.globl _foo
_foo:
	movl 4(%esp), %eax
	cmpl $1, %eax
	jl LBB_foo_4	# return
LBB_foo_1:	# no_exit.preheader
	xorl %ecx, %ecx
LBB_foo_2:	# no_exit
	movl L_X$non_lazy_ptr, %edx
	movl %ecx, (%edx)
	incl %ecx
	cmpl %eax, %ecx
	jne LBB_foo_2	# no_exit
LBB_foo_3:	# return.loopexit
LBB_foo_4:	# return
	ret

We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
rematerialization is implemented. This can be accomplished with 1) a
target-dependent LICM pass or 2) making SelectionDAG represent the whole
function.

//===---------------------------------------------------------------------===//

The following tests perform worse with LSR:
lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesort.
//===---------------------------------------------------------------------===//
We are generating far worse code than gcc:

volatile short X, Y;

void foo(int N) {
	int i;
	for (i = 0; i < N; i++) { X = i; Y = i*4; }
}

LBB1_1:	# entry.bb_crit_edge
	xorl %ecx, %ecx
	xorw %dx, %dx
LBB1_2:	# bb
	movl L_X$non_lazy_ptr, %esi
	movw %cx, (%esi)
	movl L_Y$non_lazy_ptr, %esi
	movw %dx, (%esi)
	addw $4, %dx
	incl %ecx
	cmpl %eax, %ecx
	jne LBB1_2	# bb

vs.

	xorl %edx, %edx
	movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
	movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
L4:
	movw %dx, (%esi)
	leal 0(,%edx,4), %eax
	movw %ax, (%ecx)
	addl $1, %edx
	cmpl %edx, %edi
	jne L4

This is due to the lack of post regalloc LICM.

//===---------------------------------------------------------------------===//

Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
FR64 to VR128.
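
A sketch of where this can arise (my example, assuming SSE intrinsics; the
note does not give a testcase): a scalar float lives in an FR32 vreg and is
then used as part of a VR128 value, and since both classes map onto the XMM
registers, the cross-class copy could be coalesced away.

#include <xmmintrin.h>

/* 's' is an FR32 value that feeds a VR128 computation; today the
   FR32 -> VR128 copy survives because the coalescer will not merge
   vregs of the two classes. */
__m128 add_splat(__m128 v, float s) {
  return _mm_add_ps(v, _mm_set1_ps(s));
}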
//===---------------------------------------------------------------------===//
If shorter, we should use things like:
movzwl %ax, %eax
instead of:
andl $65535, %EAX

The former can also be used when the two-addressy nature of the 'and' would
require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
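
A hedged example (mine, not from the note) of source that produces this
pattern: truncating a value to 16 bits while keeping it in a 32-bit register.

/* Can be selected either as andl $65535, %eax (two-address, may force a
   copy) or as movzwl %ax, %eax when the value is already in %eax. */
unsigned low16(unsigned x) {
  return x & 0xffff;
}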

//===---------------------------------------------------------------------===//

Another instruction selector deficiency:

void %bar() {
	%tmp = load int (int)** %foo
	%tmp = tail call int %tmp( int 3 )
	ret void
}

_bar:
	subl $12, %esp
	movl L_foo$non_lazy_ptr, %eax
	movl (%eax), %eax
	call *%eax
	addl $12, %esp
	ret

The current isel scheme will not allow the load to be folded in the call since
the load's chain result is read by the callseq_start.

//===---------------------------------------------------------------------===//

For this:
int test(int a)
However, if we care more about code size, then imull is better. It's two bytes
shorter than movl + leal.
On a Pentium M, both variants have the same characteristics with regard
to throughput; however, the multiplication has a latency of four cycles, as
opposed to two cycles for the movl+lea variant.

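As an illustrative sketch (the multiply constant here is an assumption, not
taken from the note), the comparison is between a single imull and a movl
plus leal for a small constant multiply:

/* Assumed example: a multiply by 3 can be a single
   imull $3, 4(%esp), %eax with a folded load, or a
   movl 4(%esp), %eax followed by leal (%eax,%eax,2), %eax. */
int test(int a) {
  return a * 3;
}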
//===---------------------------------------------------------------------===//
__builtin_ffs codegen is messy.
//===---------------------------------------------------------------------===//
Currently we don't have elimination of redundant stack manipulations. Consider
the code:

int %main() {
entry:
	call fastcc void %test1( )
	call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
	ret int 0
}

declare fastcc void %test1()

declare fastcc void %test2(sbyte*)

This currently compiles to:

	subl $16, %esp
	call _test5
	addl $12, %esp
	subl $16, %esp
	movl $_test5, (%esp)
	call _test6
	addl $12, %esp

The add/sub pair is really unneeded here.

//===---------------------------------------------------------------------===//

Consider the expansion of:
define i32 @test3(i32 %X) {
to grab the bytes from the next cacheline.
532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines
942 942 0x3d03 movl %dh, (1809(%esp, %esi)
937 937 0x3d0a incl %esi
3 3 0x3d0b cmpb %bl, %dl
27 27 0x3d0d jnz 0x000062db <main+11707>
//===---------------------------------------------------------------------===//
We aren't matching RMW instructions aggressively
enough. Here's a reduced testcase (more in PR1160):

define void @test(i32* %huge_ptr, i32* %target_ptr) {
	%A = load i32* %huge_ptr	; <i32> [#uses=1]
	%B = load i32* %target_ptr	; <i32> [#uses=1]
	%C = or i32 %A, %B	; <i32> [#uses=1]
	store i32 %C, i32* %target_ptr
	ret void
}

$ llvm-as < t.ll | llc -march=x86-64

_test:
	movl (%rdi), %eax
	orl (%rsi), %eax
	movl %eax, (%rsi)
	ret

That should be something like:

_test:
	movl (%rdi), %eax
	orl %eax, (%rsi)
	ret

//===---------------------------------------------------------------------===//

The following code:
bb114.preheader: ; preds = %cond_next94
if it commuted the addl in LBB1_1.
//===---------------------------------------------------------------------===//

See rdar://4653682.

From flops:

LBB1_15:	# bb310
	cvtss2sd LCPI1_0, %xmm1
	addsd %xmm1, %xmm0
	movsd 176(%esp), %xmm2
	mulsd %xmm0, %xmm2
	movapd %xmm2, %xmm3
	mulsd %xmm3, %xmm3
	movapd %xmm3, %xmm4
	mulsd LCPI1_23, %xmm4
	addsd LCPI1_24, %xmm4
	mulsd %xmm3, %xmm4
	addsd LCPI1_25, %xmm4
	mulsd %xmm3, %xmm4
	addsd LCPI1_26, %xmm4
	mulsd %xmm3, %xmm4
	addsd LCPI1_27, %xmm4
	mulsd %xmm3, %xmm4
	addsd LCPI1_28, %xmm4
	mulsd %xmm3, %xmm4
	addsd %xmm1, %xmm4
	mulsd %xmm2, %xmm4
	movsd 152(%esp), %xmm1
	addsd %xmm4, %xmm1
	movsd %xmm1, 152(%esp)
	incl %eax
	cmpl %eax, %esi
	jge LBB1_15	# bb310
LBB1_16:	# bb358.loopexit
	movsd 152(%esp), %xmm0
	addsd %xmm0, %xmm0
	addsd LCPI1_22, %xmm0
	movsd %xmm0, 152(%esp)

Rather than spilling the result of the last addsd in the loop, we should have
inserted a copy to split the interval (one for the duration of the loop, one
extending to the fall through). The register pressure in the loop isn't high
enough to warrant the spill.

Also check why xmm7 is not used at all in the function.

//===---------------------------------------------------------------------===//

Legalize loses track of the fact that bools are always zero extended when in
memory. This causes us to compile abort_gzip (from 164.gzip) from:

target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
@in_exit.4870.b = internal global i1 false	; <i1*> [#uses=2]
define fastcc void @abort_gzip() noreturn nounwind {
entry:
	%tmp.b.i = load i1* @in_exit.4870.b	; <i1> [#uses=1]
	br i1 %tmp.b.i, label %bb.i, label %bb4.i
bb.i:	; preds = %entry
	tail call void @exit( i32 1 ) noreturn nounwind
	unreachable
bb4.i:	; preds = %entry
	store i1 true, i1* @in_exit.4870.b
	tail call void @exit( i32 1 ) noreturn nounwind
	unreachable
}
declare void @exit(i32) noreturn nounwind

into:

_abort_gzip:
	subl $12, %esp
	movb _in_exit.4870.b, %al
	notb %al
	testb $1, %al
	jne LBB1_2	## bb4.i
LBB1_1:	## bb.i
	...

//===---------------------------------------------------------------------===//

We compile:

int test(int x, int y) {
	return x-y-1;
}

into (-m64):

_test:
	decl %edi
	movl %edi, %eax
	subl %esi, %eax
	ret

it would be better to codegen as: x+~y (notl+addl)
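
Since ~y == -y-1 in two's complement, x-y-1 == x+~y. A sketch of the
source-level equivalent (my rewrite, not from the note):

int test(int x, int y) {
  /* same value as x - y - 1, but maps to a notl followed by an addl */
  return x + ~y;
}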

//===---------------------------------------------------------------------===//

This code:

int foo(const char *str,...)
{
	__builtin_va_list a; int x;
	__builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
	return x;
}

gets compiled into this on x86-64:
	subq $200, %rsp
	movaps %xmm7, 160(%rsp)
	movaps %xmm6, 144(%rsp)
	movaps %xmm5, 128(%rsp)
	movaps %xmm4, 112(%rsp)
	movaps %xmm3, 96(%rsp)
	movaps %xmm2, 80(%rsp)
	movaps %xmm1, 64(%rsp)
	movaps %xmm0, 48(%rsp)
	movq %r9, 40(%rsp)
	movq %r8, 32(%rsp)
	movq %rcx, 24(%rsp)
	movq %rdx, 16(%rsp)
	movq %rsi, 8(%rsp)
	leaq (%rsp), %rax
	movq %rax, 192(%rsp)
	leaq 208(%rsp), %rax
	movq %rax, 184(%rsp)
	movl $48, 180(%rsp)
	movl $8, 176(%rsp)
	movl 176(%rsp), %eax
	cmpl $47, %eax
	jbe .LBB1_3	# bb
.LBB1_1:	# bb3
	movq 184(%rsp), %rcx
	leaq 8(%rcx), %rax
	movq %rax, 184(%rsp)
.LBB1_2:	# bb4
	movl (%rcx), %eax
	addq $200, %rsp
	ret
.LBB1_3:	# bb
	movl %eax, %ecx
	addl $8, %eax
	addq 192(%rsp), %rcx
	movl %eax, 176(%rsp)
	jmp .LBB1_2	# bb4

gcc 4.3 generates:
	subq $96, %rsp
.LCFI0:
	leaq 104(%rsp), %rax
	movq %rsi, -80(%rsp)
	movl $8, -120(%rsp)
	movq %rax, -112(%rsp)
	leaq -88(%rsp), %rax
	movq %rax, -104(%rsp)
	movl $8, %eax
	cmpl $48, %eax
	jb .L6
	movq -112(%rsp), %rdx
	movl (%rdx), %eax
	addq $96, %rsp
	ret
	.p2align 4,,10
	.p2align 3
.L6:
	mov %eax, %edx
	addq -104(%rsp), %rdx
	addl $8, %eax
	movl %eax, -120(%rsp)
	movl (%rdx), %eax
	addq $96, %rsp
	ret

and it gets compiled into this on x86:
	pushl %ebp
	movl %esp, %ebp
	subl $4, %esp
	leal 12(%ebp), %eax
	movl %eax, -4(%ebp)
	leal 16(%ebp), %eax
	movl %eax, -4(%ebp)
	movl 12(%ebp), %eax
	addl $4, %esp
	popl %ebp
	ret

gcc 4.3 generates:
	pushl %ebp
	movl %esp, %ebp
	movl 12(%ebp), %eax
	popl %ebp
	ret

//===---------------------------------------------------------------------===//

Teach tblgen not to check bitconvert source type in some cases. This allows us
to consolidate the following patterns in X86InstrMMX.td:

def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
                                                  (iPTR 0))))),
          (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
                                                  (iPTR 0))))),
          (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
                                                 (iPTR 0))))),
          (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;

There are other cases in various td files.

//===---------------------------------------------------------------------===//

Take something like the following on x86-32:
unsigned a(unsigned long long x, unsigned y) {return x % y;}

We currently generate a libcall, but we really shouldn't: the expansion is
shorter and likely faster than the libcall. The expected code is something
like the following:

	movl 12(%ebp), %eax
	movl 16(%ebp), %ecx
	xorl %edx, %edx
	divl %ecx
	movl 8(%ebp), %eax
	divl %ecx
	movl %edx, %eax
	ret

A similar code sequence works for division.
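
For reference, the division counterpart (my example; the note only mentions it
in passing): when the quotient is only needed modulo 2^32, the same two-divl
expansion works, with the result coming from %eax rather than %edx.

/* 64-by-32 divide whose result is truncated to 32 bits; expandable to the
   same divl/divl sequence, returning the quotient of the second divl. */
unsigned b(unsigned long long x, unsigned y) {
  return x / y;
}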

//===---------------------------------------------------------------------===//

These should compile to the same code, but the latter codegens to useless
instructions on X86. This may be a trivial dag combine (GCC PR7061):

struct s1 { unsigned char a, b; };
unsigned long f1(struct s1 x) {
	return x.a + x.b;
}
struct s2 { unsigned a: 8, b: 8; };
unsigned long f2(struct s2 x) {
	return x.a + x.b;
}

//===---------------------------------------------------------------------===//

We currently compile this:

define i32 @func1(i32 %v1, i32 %v2) nounwind {
entry:
	%t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
	%sum = extractvalue {i32, i1} %t, 0
	%obit = extractvalue {i32, i1} %t, 1
	br i1 %obit, label %overflow, label %normal
normal:
	ret i32 %sum
overflow:
	call void @llvm.trap()
	unreachable
}
declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
declare void @llvm.trap()

to:

_func1:
	movl 4(%esp), %eax
	addl 8(%esp), %eax
	jo LBB1_2	## overflow
LBB1_1:	## normal
	ret
LBB1_2:	## overflow
	ud2

it would be nice to produce "into" someday.

//===---------------------------------------------------------------------===//

This code:

void vec_mpys1(int y[], const int x[], int scaler) {
	int i;
	for (i = 0; i < 150; i++)
		y[i] += (((long long)scaler * (long long)x[i]) >> 31);
}

Compiles to this loop with GCC 3.x:

.L5:
	movl %ebx, %eax
	imull (%edi,%ecx,4)
	shrdl $31, %edx, %eax
	addl %eax, (%esi,%ecx,4)
	incl %ecx
	cmpl $149, %ecx
	jle .L5

llvm-gcc compiles it to the much uglier:

LBB1_1:	## bb1
	movl 24(%esp), %eax
	movl (%eax,%edi,4), %ebx
	movl %ebx, %ebp
	imull %esi, %ebp
	movl %ebx, %eax
	mull %ecx
	addl %ebp, %edx
	sarl $31, %ebx
	imull %ecx, %ebx
	addl %edx, %ebx
	shldl $1, %eax, %ebx
	movl 20(%esp), %eax
	addl %ebx, (%eax,%edi,4)
	incl %edi
	cmpl $150, %edi
	jne LBB1_1	## bb1

//===---------------------------------------------------------------------===//

Test instructions can be eliminated by using EFLAGS values from arithmetic
instructions. This is currently not done for mul, and, or, xor, neg, shl,
sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
for read-modify-write instructions. It is also currently not done if the
OF or CF flags are needed.

The shift operators have the complication that when the shift count is
zero, EFLAGS is not set, so they can only subsume a test instruction if
the shift count is known to be non-zero. Also, using the EFLAGS value
from a shift is apparently very slow on some x86 implementations.

In read-modify-write instructions, the root node in the isel match is
the store, and isel has no way for the use of the EFLAGS result of the
arithmetic to be remapped to the new node.

Add and subtract instructions set OF on signed overflow and CF on unsigned
overflow, while test instructions always clear OF and CF. In order to
replace a test with an add or subtract in a situation where OF or CF is
needed, codegen must be able to prove that the operation cannot see
signed or unsigned overflow, respectively.
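
A small illustration of the basic case (my example, not from the notes): the
andl below already sets ZF, so the separate test emitted before the branch is
redundant.

int f(int a, int b) {
  int t = a & b;   /* the andl sets ZF according to the result */
  if (t == 0)      /* could reuse EFLAGS; today a testl is emitted */
    return -1;
  return t;
}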

//===---------------------------------------------------------------------===//

memcpy/memmove do not lower to SSE copies when possible. A silly example is:
define <16 x float> @foo(<16 x float> %A) nounwind {
	%tmp = alloca <16 x float>, align 16
	%tmp2 = alloca <16 x float>, align 16
	store <16 x float> %A, <16 x float>* %tmp
	%s = bitcast <16 x float>* %tmp to i8*
	%s2 = bitcast <16 x float>* %tmp2 to i8*
	call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
	%R = load <16 x float>* %tmp2
	ret <16 x float> %R
}

declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind

which compiles to:

_foo:
	subl $140, %esp
	movaps %xmm3, 112(%esp)
	movaps %xmm2, 96(%esp)
	movaps %xmm1, 80(%esp)
	movaps %xmm0, 64(%esp)
	movl 60(%esp), %eax
	movl %eax, 124(%esp)
	movl 56(%esp), %eax
	movl %eax, 120(%esp)
	movl 52(%esp), %eax
	<many many more 32-bit copies>
	movaps (%esp), %xmm0
	movaps 16(%esp), %xmm1
	movaps 32(%esp), %xmm2
	movaps 48(%esp), %xmm3
	addl $140, %esp
	ret

On Nehalem, it may even be cheaper to just use movups when unaligned than to
fall back to lower-granularity chunks.

//===---------------------------------------------------------------------===//

Implement processor-specific optimizations for parity with GCC on these
processors. GCC does two optimizations:

1. ix86_pad_returns inserts a noop before ret instructions if they are
   immediately preceded by a conditional branch or are the target of a jump.
2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
   code contains more than 3 branches.

The first one is done for all AMDs, Core2, and "Generic".
The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
Core 2, and "Generic".

//===---------------------------------------------------------------------===//

Testcase:
int a(int x) { return (x & 127) > 31; }

Current output:
	movl 4(%esp), %eax
	andl $127, %eax
	cmpl $31, %eax
	seta %al
	movzbl %al, %eax
	ret

Ideal output:
	xorl %eax, %eax
	testl $96, 4(%esp)
	setne %al
	ret

This should definitely be done in instcombine, canonicalizing the range
condition into a != condition. We get this IR:

define i32 @a(i32 %x) nounwind readnone {
entry:
	%0 = and i32 %x, 127	; <i32> [#uses=1]
	%1 = icmp ugt i32 %0, 31	; <i1> [#uses=1]
	%2 = zext i1 %1 to i32	; <i32> [#uses=1]
	ret i32 %2
}

Instcombine prefers to strength reduce relational comparisons to equality
comparisons when possible; this should be another case of that. This could
be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but
it looks like InstCombiner::visitICmpInstWithInstAndIntCst really should be
redesigned to use ComputeMaskedBits and friends anyway.
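
Concretely, the canonical form corresponds to this source-level rewrite (a
sketch; the constants follow from the masks above):

/* (x & 127) > 31 is true exactly when bit 5 or bit 6 is set, i.e. when
   (x & 96) != 0, which codegens to the testl $96 form shown above. */
int a(int x) {
  return (x & 96) != 0;
}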

//===---------------------------------------------------------------------===//

Testcase:
int x(int a) { return (a&0xf0)>>4; }

Current output:
	movl 4(%esp), %eax
	shrl $4, %eax
	andl $15, %eax
	ret

Ideal output:
	movzbl 4(%esp), %eax
	shrl $4, %eax
	ret

//===---------------------------------------------------------------------===//

Testcase:
int x(int a) { return (a & 0x80) ? 0x100 : 0; }
int y(int a) { return (a & 0x80) *2; }

Current:
	testl $128, 4(%esp)
	setne %al
	movzbl %al, %eax
	shll $8, %eax
	ret

Better:
	movl 4(%esp), %eax
	addl %eax, %eax
	andl $256, %eax
	ret

This is another general instcombine transformation that is profitable on all
targets. In LLVM IR, these functions look like this:

define i32 @x(i32 %a) nounwind readnone {
entry:
	%0 = and i32 %a, 128
	%1 = icmp eq i32 %0, 0
	%iftmp.0.0 = select i1 %1, i32 0, i32 256
	ret i32 %iftmp.0.0
}

define i32 @y(i32 %a) nounwind readnone {
entry:
	%0 = shl i32 %a, 1
	%1 = and i32 %0, 256
	ret i32 %1
}

Replacing an icmp+select with a shift should always be considered profitable in
instcombine.

//===---------------------------------------------------------------------===//