X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FREADME.txt;h=e67fab109047ed9cdce789af6c0fb50bae122c35;hb=d93ceeb125c11a96eb85618bb9a8a7d664a1d8f4;hp=2b867f4d8dc4428a5ab812e27de461bbd8ea6a8b;hpb=be685cc72a52a297c217384763b25cfe1c94e80b;p=oota-llvm.git diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index 2b867f4d8dc..e67fab10904 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -2,6 +2,8 @@ // Random ideas for the X86 backend. //===---------------------------------------------------------------------===// +We should add support for the "movbe" instruction, which does a byte-swapping +copy (3-addr bswap + memory support?) This is available on Atom processors. //===---------------------------------------------------------------------===// @@ -121,20 +123,6 @@ when it can invert the result of the compare for free. //===---------------------------------------------------------------------===// -How about intrinsics? An example is: - *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C)); - -compiles to - pmuludq (%eax), %xmm0 - movl 8(%esp), %eax - movdqa (%eax), %xmm1 - pmulhuw %xmm0, %xmm1 - -The transformation probably requires a X86 specific pass or a DAG combiner -target specific hook. - -//===---------------------------------------------------------------------===// - In many cases, LLVM generates code like this: _test: @@ -233,102 +221,12 @@ Optimize copysign(x, *y) to use an integer load from y. //===---------------------------------------------------------------------===// -%X = weak global int 0 - -void %foo(int %N) { - %N = cast int %N to uint - %tmp.24 = setgt int %N, 0 - br bool %tmp.24, label %no_exit, label %return - -no_exit: - %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ] - %i.0.0 = cast uint %indvar to int - volatile store int %i.0.0, int* %X - %indvar.next = add uint %indvar, 1 - %exitcond = seteq uint %indvar.next, %N - br bool %exitcond, label %return, label %no_exit - -return: - ret void -} - -compiles into: - - .text - .align 4 - .globl _foo -_foo: - movl 4(%esp), %eax - cmpl $1, %eax - jl LBB_foo_4 # return -LBB_foo_1: # no_exit.preheader - xorl %ecx, %ecx -LBB_foo_2: # no_exit - movl L_X$non_lazy_ptr, %edx - movl %ecx, (%edx) - incl %ecx - cmpl %eax, %ecx - jne LBB_foo_2 # no_exit -LBB_foo_3: # return.loopexit -LBB_foo_4: # return - ret - -We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after -remateralization is implemented. This can be accomplished with 1) a target -dependent LICM pass or 2) makeing SelectDAG represent the whole function. - -//===---------------------------------------------------------------------===// - The following tests perform worse with LSR: lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor. //===---------------------------------------------------------------------===// -We are generating far worse code than gcc: - -volatile short X, Y; - -void foo(int N) { - int i; - for (i = 0; i < N; i++) { X = i; Y = i*4; } -} - -LBB1_1: # entry.bb_crit_edge - xorl %ecx, %ecx - xorw %dx, %dx -LBB1_2: # bb - movl L_X$non_lazy_ptr, %esi - movw %cx, (%esi) - movl L_Y$non_lazy_ptr, %esi - movw %dx, (%esi) - addw $4, %dx - incl %ecx - cmpl %eax, %ecx - jne LBB1_2 # bb - -vs. - - xorl %edx, %edx - movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi - movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx -L4: - movw %dx, (%esi) - leal 0(,%edx,4), %eax - movw %ax, (%ecx) - addl $1, %edx - cmpl %edx, %edi - jne L4 - -This is due to the lack of post regalloc LICM. - -//===---------------------------------------------------------------------===// - -Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 / -FR64 to VR128. - -//===---------------------------------------------------------------------===// - Adding to the list of cmp / test poor codegen issues: int test(__m128 *A, __m128 *B) { @@ -565,35 +463,6 @@ _usesbb: //===---------------------------------------------------------------------===// -Currently we don't have elimination of redundant stack manipulations. Consider -the code: - -int %main() { -entry: - call fastcc void %test1( ) - call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) ) - ret int 0 -} - -declare fastcc void %test1() - -declare fastcc void %test2(sbyte*) - - -This currently compiles to: - - subl $16, %esp - call _test5 - addl $12, %esp - subl $16, %esp - movl $_test5, (%esp) - call _test6 - addl $12, %esp - -The add\sub pair is really unneeded here. - -//===---------------------------------------------------------------------===// - Consider the expansion of: define i32 @test3(i32 %X) { @@ -656,7 +525,7 @@ We should inline lrintf and probably other libc functions. //===---------------------------------------------------------------------===// -Start using the flags more. For example, compile: +Use the FLAGS values from arithmetic instructions more. For example, compile: int add_zf(int *x, int y, int a, int b) { if ((*x += y) == 0) @@ -680,31 +549,8 @@ _add_zf: movl %ecx, %eax ret -and: - -int add_zf(int *x, int y, int a, int b) { - if ((*x + y) < 0) - return a; - else - return b; -} - -to: - -add_zf: - addl (%rdi), %esi - movl %edx, %eax - cmovns %ecx, %eax - ret - -instead of: - -_add_zf: - addl (%rdi), %esi - testl %esi, %esi - cmovs %edx, %ecx - movl %ecx, %eax - ret +As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll +without a test instruction. //===---------------------------------------------------------------------===// @@ -811,55 +657,6 @@ Though this probably isn't worth it. //===---------------------------------------------------------------------===// -We need to teach the codegen to convert two-address INC instructions to LEA -when the flags are dead (likewise dec). For example, on X86-64, compile: - -int foo(int A, int B) { - return A+1; -} - -to: - -_foo: - leal 1(%edi), %eax - ret - -instead of: - -_foo: - incl %edi - movl %edi, %eax - ret - -Another example is: - -;; X's live range extends beyond the shift, so the register allocator -;; cannot coalesce it with Y. Because of this, a copy needs to be -;; emitted before the shift to save the register value before it is -;; clobbered. However, this copy is not needed if the register -;; allocator turns the shift into an LEA. This also occurs for ADD. - -; Check that the shift gets turned into an LEA. -; RUN: llvm-as < %s | llc -march=x86 -x86-asm-syntax=intel | \ -; RUN: not grep {mov E.X, E.X} - -@G = external global i32 ; [#uses=3] - -define i32 @test1(i32 %X, i32 %Y) { - %Z = add i32 %X, %Y ; [#uses=1] - volatile store i32 %Y, i32* @G - volatile store i32 %Z, i32* @G - ret i32 %X -} - -define i32 @test2(i32 %X) { - %Z = add i32 %X, 1 ; [#uses=1] - volatile store i32 %Z, i32* @G - ret i32 %X -} - -//===---------------------------------------------------------------------===// - Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with a neg instead of a sub instruction. Consider: @@ -980,39 +777,6 @@ __Z11no_overflowjj: //===---------------------------------------------------------------------===// -Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the -condition register is dead. xor reg reg is shorter than mov reg, #0. - -//===---------------------------------------------------------------------===// - -We aren't matching RMW instructions aggressively -enough. Here's a reduced testcase (more in PR1160): - -define void @test(i32* %huge_ptr, i32* %target_ptr) { - %A = load i32* %huge_ptr ; [#uses=1] - %B = load i32* %target_ptr ; [#uses=1] - %C = or i32 %A, %B ; [#uses=1] - store i32 %C, i32* %target_ptr - ret void -} - -$ llvm-as < t.ll | llc -march=x86-64 - -_test: - movl (%rdi), %eax - orl (%rsi), %eax - movl %eax, (%rsi) - ret - -That should be something like: - -_test: - movl (%rdi), %eax - orl %eax, (%rsi) - ret - -//===---------------------------------------------------------------------===// - The following code: bb114.preheader: ; preds = %cond_next94 @@ -1131,6 +895,24 @@ compare: //===---------------------------------------------------------------------===// +Linux is missing some basic tail call support: + +#include +double foo(double a) { return sin(a); } + +This compiles into this on x86-64 Linux (but not darwin): +foo: + subq $8, %rsp + call sin + addq $8, %rsp + ret +vs: + +foo: + jmp sin + +//===---------------------------------------------------------------------===// + Tail call optimization improvements: Tail call optimization currently pushes all arguments on the top of the stack (their normal place for non-tail call optimized calls) that source from the callers arguments @@ -1339,57 +1121,6 @@ be folded into: shl [mem], 1 //===---------------------------------------------------------------------===// -This testcase misses a read/modify/write opportunity (from PR1425): - -void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){ - int i; - for(i=0; i>0; -} - -We compile it down to: - -LBB1_2: # bb - movl (%esi,%edi,4), %ebx - addl (%ecx,%edi,4), %ebx - addl (%edx,%edi,4), %ebx - movl %ebx, (%ecx,%edi,4) - incl %edi - cmpl %eax, %edi - jne LBB1_2 # bb - -the inner loop should add to the memory location (%ecx,%edi,4), saving -a mov. Something like: - - movl (%esi,%edi,4), %ebx - addl (%edx,%edi,4), %ebx - addl %ebx, (%ecx,%edi,4) - -Here is another interesting example: - -void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){ - int i; - for(i=0; i>0; -} - -We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]: - -LBB9_2: # bb - movl (%ecx,%edi,4), %ebx - subl (%esi,%edi,4), %ebx - subl (%edx,%edi,4), %ebx - movl %ebx, (%ecx,%edi,4) - incl %edi - cmpl %eax, %edi - jne LBB9_2 # bb - -Additionally, LSR should rewrite the exit condition of these loops to use -a stride-4 IV, would would allow all the scales in the loop to go away. -This would result in smaller code and more efficient microops. - -//===---------------------------------------------------------------------===// - In SSE mode, we turn abs and neg into a load from the constant pool plus a xor or and instruction, for example: @@ -1422,13 +1153,6 @@ void test(double *P) { //===---------------------------------------------------------------------===// -handling llvm.memory.barrier on pre SSE2 cpus - -should generate: -lock ; mov %esp, %esp - -//===---------------------------------------------------------------------===// - The generated code on x86 for checking for signed overflow on a multiply the obvious way is much longer than it needs to be. @@ -1537,15 +1261,8 @@ FirstOnet: xorl %eax, %eax ret -There are a few possible improvements here: -1. We should be able to eliminate the dead load into %ecx -2. We could change the "movl 8(%esp), %eax" into - "movzwl 10(%esp), %eax"; this lets us change the cmpl - into a testl, which is shorter, and eliminate the shift. - -We could also in theory eliminate the branch by using a conditional -for the address of the load, but that seems unlikely to be worthwhile -in general. +We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this +lets us change the cmpl into a testl, which is shorter, and eliminate the shift. //===---------------------------------------------------------------------===// @@ -1567,22 +1284,23 @@ bb7: ; preds = %entry to: -_foo: +foo: # @foo +# BB#0: # %entry + movl 4(%esp), %ecx cmpb $0, 16(%esp) - movl 12(%esp), %ecx + je .LBB0_2 +# BB#1: # %bb movl 8(%esp), %eax - movl 4(%esp), %edx - je LBB1_2 # bb7 -LBB1_1: # bb - addl %edx, %eax + addl %ecx, %eax ret -LBB1_2: # bb7 - movl %edx, %eax - subl %ecx, %eax +.LBB0_2: # %bb7 + movl 12(%esp), %edx + movl %ecx, %eax + subl %edx, %eax ret -The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2 -if it commuted the addl in LBB1_1. +There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a +couple more movls by putting 4(%esp) into %eax instead of %ecx. //===---------------------------------------------------------------------===// @@ -1632,8 +1350,7 @@ Also check why xmm7 is not used at all in the function. //===---------------------------------------------------------------------===// -Legalize loses track of the fact that bools are always zero extended when in -memory. This causes us to compile abort_gzip (from 164.gzip) from: +Take the following: target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i386-apple-darwin8" @@ -1652,16 +1369,15 @@ bb4.i: ; preds = %entry } declare void @exit(i32) noreturn nounwind -into: - -_abort_gzip: +This compiles into: +_abort_gzip: ## @abort_gzip +## BB#0: ## %entry subl $12, %esp movb _in_exit.4870.b, %al - notb %al - testb $1, %al - jne LBB1_2 ## bb4.i -LBB1_1: ## bb.i - ... + cmpb $1, %al + jne LBB0_2 + +We somehow miss folding the movb into the cmpb. //===---------------------------------------------------------------------===// @@ -1862,3 +1578,403 @@ LBB1_2: ## overflow it would be nice to produce "into" someday. //===---------------------------------------------------------------------===// + +This code: + +void vec_mpys1(int y[], const int x[], int scaler) { +int i; +for (i = 0; i < 150; i++) + y[i] += (((long long)scaler * (long long)x[i]) >> 31); +} + +Compiles to this loop with GCC 3.x: + +.L5: + movl %ebx, %eax + imull (%edi,%ecx,4) + shrdl $31, %edx, %eax + addl %eax, (%esi,%ecx,4) + incl %ecx + cmpl $149, %ecx + jle .L5 + +llvm-gcc compiles it to the much uglier: + +LBB1_1: ## bb1 + movl 24(%esp), %eax + movl (%eax,%edi,4), %ebx + movl %ebx, %ebp + imull %esi, %ebp + movl %ebx, %eax + mull %ecx + addl %ebp, %edx + sarl $31, %ebx + imull %ecx, %ebx + addl %edx, %ebx + shldl $1, %eax, %ebx + movl 20(%esp), %eax + addl %ebx, (%eax,%edi,4) + incl %edi + cmpl $150, %edi + jne LBB1_1 ## bb1 + +The issue is that we hoist the cast of "scaler" to long long outside of the +loop, the value comes into the loop as two values, and +RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the +constructed BUILD_PAIR which represents the cast value. + +//===---------------------------------------------------------------------===// + +Test instructions can be eliminated by using EFLAGS values from arithmetic +instructions. This is currently not done for mul, and, or, xor, neg, shl, +sra, srl, shld, shrd, atomic ops, and others. It is also currently not done +for read-modify-write instructions. It is also current not done if the +OF or CF flags are needed. + +The shift operators have the complication that when the shift count is +zero, EFLAGS is not set, so they can only subsume a test instruction if +the shift count is known to be non-zero. Also, using the EFLAGS value +from a shift is apparently very slow on some x86 implementations. + +In read-modify-write instructions, the root node in the isel match is +the store, and isel has no way for the use of the EFLAGS result of the +arithmetic to be remapped to the new node. + +Add and subtract instructions set OF on signed overflow and CF on unsiged +overflow, while test instructions always clear OF and CF. In order to +replace a test with an add or subtract in a situation where OF or CF is +needed, codegen must be able to prove that the operation cannot see +signed or unsigned overflow, respectively. + +//===---------------------------------------------------------------------===// + +memcpy/memmove do not lower to SSE copies when possible. A silly example is: +define <16 x float> @foo(<16 x float> %A) nounwind { + %tmp = alloca <16 x float>, align 16 + %tmp2 = alloca <16 x float>, align 16 + store <16 x float> %A, <16 x float>* %tmp + %s = bitcast <16 x float>* %tmp to i8* + %s2 = bitcast <16 x float>* %tmp2 to i8* + call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16) + %R = load <16 x float>* %tmp2 + ret <16 x float> %R +} + +declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind + +which compiles to: + +_foo: + subl $140, %esp + movaps %xmm3, 112(%esp) + movaps %xmm2, 96(%esp) + movaps %xmm1, 80(%esp) + movaps %xmm0, 64(%esp) + movl 60(%esp), %eax + movl %eax, 124(%esp) + movl 56(%esp), %eax + movl %eax, 120(%esp) + movl 52(%esp), %eax + + movaps (%esp), %xmm0 + movaps 16(%esp), %xmm1 + movaps 32(%esp), %xmm2 + movaps 48(%esp), %xmm3 + addl $140, %esp + ret + +On Nehalem, it may even be cheaper to just use movups when unaligned than to +fall back to lower-granularity chunks. + +//===---------------------------------------------------------------------===// + +Implement processor-specific optimizations for parity with GCC on these +processors. GCC does two optimizations: + +1. ix86_pad_returns inserts a noop before ret instructions if immediately + preceeded by a conditional branch or is the target of a jump. +2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of + code contains more than 3 branches. + +The first one is done for all AMDs, Core2, and "Generic" +The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona, + Core 2, and "Generic" + +//===---------------------------------------------------------------------===// + +Testcase: +int a(int x) { return (x & 127) > 31; } + +Current output: + movl 4(%esp), %eax + andl $127, %eax + cmpl $31, %eax + seta %al + movzbl %al, %eax + ret + +Ideal output: + xorl %eax, %eax + testl $96, 4(%esp) + setne %al + ret + +This should definitely be done in instcombine, canonicalizing the range +condition into a != condition. We get this IR: + +define i32 @a(i32 %x) nounwind readnone { +entry: + %0 = and i32 %x, 127 ; [#uses=1] + %1 = icmp ugt i32 %0, 31 ; [#uses=1] + %2 = zext i1 %1 to i32 ; [#uses=1] + ret i32 %2 +} + +Instcombine prefers to strength reduce relational comparisons to equality +comparisons when possible, this should be another case of that. This could +be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it +looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already +be redesigned to use ComputeMaskedBits and friends. + + +//===---------------------------------------------------------------------===// +Testcase: +int x(int a) { return (a&0xf0)>>4; } + +Current output: + movl 4(%esp), %eax + shrl $4, %eax + andl $15, %eax + ret + +Ideal output: + movzbl 4(%esp), %eax + shrl $4, %eax + ret + +//===---------------------------------------------------------------------===// + +Testcase: +int x(int a) { return (a & 0x80) ? 0x100 : 0; } +int y(int a) { return (a & 0x80) *2; } + +Current: + testl $128, 4(%esp) + setne %al + movzbl %al, %eax + shll $8, %eax + ret + +Better: + movl 4(%esp), %eax + addl %eax, %eax + andl $256, %eax + ret + +This is another general instcombine transformation that is profitable on all +targets. In LLVM IR, these functions look like this: + +define i32 @x(i32 %a) nounwind readnone { +entry: + %0 = and i32 %a, 128 + %1 = icmp eq i32 %0, 0 + %iftmp.0.0 = select i1 %1, i32 0, i32 256 + ret i32 %iftmp.0.0 +} + +define i32 @y(i32 %a) nounwind readnone { +entry: + %0 = shl i32 %a, 1 + %1 = and i32 %0, 256 + ret i32 %1 +} + +Replacing an icmp+select with a shift should always be considered profitable in +instcombine. + +//===---------------------------------------------------------------------===// + +Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch +properly. + +When the return value is not used (i.e. only care about the value in the +memory), x86 does not have to use add to implement these. Instead, it can use +add, sub, inc, dec instructions with the "lock" prefix. + +This is currently implemented using a bit of instruction selection trick. The +issue is the target independent pattern produces one output and a chain and we +want to map it into one that just output a chain. The current trick is to select +it into a MERGE_VALUES with the first definition being an implicit_def. The +proper solution is to add new ISD opcodes for the no-output variant. DAG +combiner can then transform the node before it gets to target node selection. + +Problem #2 is we are adding a whole bunch of x86 atomic instructions when in +fact these instructions are identical to the non-lock versions. We need a way to +add target specific information to target nodes and have this information +carried over to machine instructions. Asm printer (or JIT) can use this +information to add the "lock" prefix. + +//===---------------------------------------------------------------------===// + +_Bool bar(int *x) { return *x & 1; } + +define zeroext i1 @bar(i32* nocapture %x) nounwind readonly { +entry: + %tmp1 = load i32* %x ; [#uses=1] + %and = and i32 %tmp1, 1 ; [#uses=1] + %tobool = icmp ne i32 %and, 0 ; [#uses=1] + ret i1 %tobool +} + +bar: # @bar +# BB#0: # %entry + movl 4(%esp), %eax + movb (%eax), %al + andb $1, %al + movzbl %al, %eax + ret + +Missed optimization: should be movl+andl. + +//===---------------------------------------------------------------------===// + +Consider the following two functions compiled with clang: +_Bool foo(int *x) { return !(*x & 4); } +unsigned bar(int *x) { return !(*x & 4); } + +foo: + movl 4(%esp), %eax + testb $4, (%eax) + sete %al + movzbl %al, %eax + ret + +bar: + movl 4(%esp), %eax + movl (%eax), %eax + shrl $2, %eax + andl $1, %eax + xorl $1, %eax + ret + +The second function generates more code even though the two functions are +are functionally identical. + +//===---------------------------------------------------------------------===// + +Take the following C code: +int x(int y) { return (y & 63) << 14; } + +Code produced by gcc: + andl $63, %edi + sall $14, %edi + movl %edi, %eax + ret + +Code produced by clang: + shll $14, %edi + movl %edi, %eax + andl $1032192, %eax + ret + +The code produced by gcc is 3 bytes shorter. This sort of construct often +shows up with bitfields. + +//===---------------------------------------------------------------------===// + +Take the following C code: +int f(int a, int b) { return (unsigned char)a == (unsigned char)b; } + +We generate the following IR with clang: +define i32 @f(i32 %a, i32 %b) nounwind readnone { +entry: + %tmp = xor i32 %b, %a ; [#uses=1] + %tmp6 = and i32 %tmp, 255 ; [#uses=1] + %cmp = icmp eq i32 %tmp6, 0 ; [#uses=1] + %conv5 = zext i1 %cmp to i32 ; [#uses=1] + ret i32 %conv5 +} + +And the following x86 code: + xorl %esi, %edi + testb $-1, %dil + sete %al + movzbl %al, %eax + ret + +A cmpb instead of the xorl+testb would be one instruction shorter. + +//===---------------------------------------------------------------------===// + +Given the following C code: +int f(int a, int b) { return (signed char)a == (signed char)b; } + +We generate the following IR with clang: +define i32 @f(i32 %a, i32 %b) nounwind readnone { +entry: + %sext = shl i32 %a, 24 ; [#uses=1] + %conv1 = ashr i32 %sext, 24 ; [#uses=1] + %sext6 = shl i32 %b, 24 ; [#uses=1] + %conv4 = ashr i32 %sext6, 24 ; [#uses=1] + %cmp = icmp eq i32 %conv1, %conv4 ; [#uses=1] + %conv5 = zext i1 %cmp to i32 ; [#uses=1] + ret i32 %conv5 +} + +And the following x86 code: + movsbl %sil, %eax + movsbl %dil, %ecx + cmpl %eax, %ecx + sete %al + movzbl %al, %eax + ret + + +It should be possible to eliminate the sign extensions. + +//===---------------------------------------------------------------------===// + +LLVM misses a load+store narrowing opportunity in this code: + +%struct.bf = type { i64, i16, i16, i32 } + +@bfi = external global %struct.bf* ; <%struct.bf**> [#uses=2] + +define void @t1() nounwind ssp { +entry: + %0 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1] + %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; [#uses=1] + %2 = bitcast i16* %1 to i32* ; [#uses=2] + %3 = load i32* %2, align 1 ; [#uses=1] + %4 = and i32 %3, -65537 ; [#uses=1] + store i32 %4, i32* %2, align 1 + %5 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1] + %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; [#uses=1] + %7 = bitcast i16* %6 to i32* ; [#uses=2] + %8 = load i32* %7, align 1 ; [#uses=1] + %9 = and i32 %8, -131073 ; [#uses=1] + store i32 %9, i32* %7, align 1 + ret void +} + +LLVM currently emits this: + + movq bfi(%rip), %rax + andl $-65537, 8(%rax) + movq bfi(%rip), %rax + andl $-131073, 8(%rax) + ret + +It could narrow the loads and stores to emit this: + + movq bfi(%rip), %rax + andb $-2, 10(%rax) + movq bfi(%rip), %rax + andb $-3, 10(%rax) + ret + +The trouble is that there is a TokenFactor between the store and the +load, making it non-trivial to determine if there's anything between +the load and the store which would prohibit narrowing. + +//===---------------------------------------------------------------------===//