lib/Target/README.txt

   1 Target Independent Opportunities:
   2
   3 //===---------------------------------------------------------------------===//
   4
   5 Dead argument elimination should be enhanced to handle cases when an argument is
   6 dead to an externally visible function.  Though the argument can't be removed
   7 from the externally visible function, the caller doesn't need to pass it in.
   8 For example in this testcase:
   9
  10   void foo(int X) __attribute__((noinline));
  11   void foo(int X) { sideeffect(); }
  12   void bar(int A) { foo(A+1); }
  13
  14 We compile bar to:
  15
  16 define void @bar(i32 %A) nounwind ssp {
  17   %0 = add nsw i32 %A, 1                          ; <i32> [#uses=1]
  18   tail call void @foo(i32 %0) nounwind noinline ssp
  19   ret void
  20 }
  21
  22 The add is dead, we could pass in 'i32 undef' instead.  This occurs for C++
  23 templates etc, which usually have linkonce_odr/weak_odr linkage, not internal
  24 linkage.
  25
  26 //===---------------------------------------------------------------------===//
  27
  28 With the recent changes to make the implicit def/use set explicit in
  29 machineinstrs, we should change the target descriptions for 'call' instructions
  30 so that the .td files don't list all the call-clobbered registers as implicit
  31 defs.  Instead, these should be added by the code generator (e.g. on the dag).
  32
  33 This has a number of uses:
  34
  35 1. PPC32/64 and X86 32/64 can avoid having multiple copies of call instructions
  36    for their different impdef sets.
  37 2. Targets with multiple calling convs (e.g. x86) which have different clobber
  38    sets don't need copies of call instructions.
  39 3. 'Interprocedural register allocation' can be done to reduce the clobber sets
  40    of calls.
  41
  42 //===---------------------------------------------------------------------===//
  43
  44 Make the PPC branch selector target independant
  45
  46 //===---------------------------------------------------------------------===//
  47
  48 Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
  49 precision don't matter (ffastmath).  Misc/mandel will like this. :)  This isn't
  50 safe in general, even on darwin.  See the libm implementation of hypot for
  51 examples (which special case when x/y are exactly zero to get signed zeros etc
  52 right).
  53
  54 //===---------------------------------------------------------------------===//
  55
  56 Solve this DAG isel folding deficiency:
  57
  58 int X, Y;
  59
  60 void fn1(void)
  61 {
  62   X = X | (Y << 3);
  63 }
  64
  65 compiles to
  66
  67 fn1:
  68         movl Y, %eax
  69         shll $3, %eax
  70         orl X, %eax
  71         movl %eax, X
  72         ret
  73
  74 The problem is the store's chain operand is not the load X but rather
  75 a TokenFactor of the load X and load Y, which prevents the folding.
  76
  77 There are two ways to fix this:
  78
  79 1. The dag combiner can start using alias analysis to realize that y/x
  80    don't alias, making the store to X not dependent on the load from Y.
  81 2. The generated isel could be made smarter in the case it can't
  82    disambiguate the pointers.
  83
  84 Number 1 is the preferred solution.
  85
  86 This has been "fixed" by a TableGen hack. But that is a short term workaround
  87 which will be removed once the proper fix is made.
  88
  89 //===---------------------------------------------------------------------===//
  90
  91 On targets with expensive 64-bit multiply, we could LSR this:
  92
  93 for (i = ...; ++i) {
  94    x = 1ULL << i;
  95
  96 into:
  97  long long tmp = 1;
  98  for (i = ...; ++i, tmp+=tmp)
  99    x = tmp;
 100
 101 This would be a win on ppc32, but not x86 or ppc64.
 102
 103 //===---------------------------------------------------------------------===//
 104
 105 Shrink: (setlt (loadi32 P), 0) -> (setlt (loadi8 Phi), 0)
 106
 107 //===---------------------------------------------------------------------===//
 108
 109 Reassociate should turn things like:
 110
 111 int factorial(int X) {
 112  return X*X*X*X*X*X*X*X;
 113 }
 114
 115 into llvm.powi calls, allowing the code generator to produce balanced
 116 multiplication trees.
 117
 118 First, the intrinsic needs to be extended to support integers, and second the
 119 code generator needs to be enhanced to lower these to multiplication trees.
 120
 121 //===---------------------------------------------------------------------===//
 122
 123 Interesting? testcase for add/shift/mul reassoc:
 124
 125 int bar(int x, int y) {
 126   return x*x*x+y+x*x*x*x*x*y*y*y*y;
 127 }
 128 int foo(int z, int n) {
 129   return bar(z, n) + bar(2*z, 2*n);
 130 }
 131
 132 This is blocked on not handling X*X*X -> powi(X, 3) (see note above).  The issue
 133 is that we end up getting t = 2*X  s = t*t   and don't turn this into 4*X*X,
 134 which is the same number of multiplies and is canonical, because the 2*X has
 135 multiple uses.  Here's a simple example:
 136
 137 define i32 @test15(i32 %X1) {
 138   %B = mul i32 %X1, 47   ; X1*47
 139   %C = mul i32 %B, %B
 140   ret i32 %C
 141 }
 142
 143
 144 //===---------------------------------------------------------------------===//
 145
 146 Reassociate should handle the example in GCC PR16157:
 147
 148 extern int a0, a1, a2, a3, a4; extern int b0, b1, b2, b3, b4;
 149 void f () {  /* this can be optimized to four additions... */
 150         b4 = a4 + a3 + a2 + a1 + a0;
 151         b3 = a3 + a2 + a1 + a0;
 152         b2 = a2 + a1 + a0;
 153         b1 = a1 + a0;
 154 }
 155
 156 This requires reassociating to forms of expressions that are already available,
 157 something that reassoc doesn't think about yet.
 158
 159 //===---------------------------------------------------------------------===//
 160
 161 These two functions should generate the same code on big-endian systems:
 162
 163 int g(int *j,int *l)  {  return memcmp(j,l,4);  }
 164 int h(int *j, int *l) {  return *j - *l; }
 165
 166 this could be done in SelectionDAGISel.cpp, along with other special cases,
 167 for 1,2,4,8 bytes.
 168
 169 //===---------------------------------------------------------------------===//
 170
 171 It would be nice to revert this patch:
 172 http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
 173
 174 And teach the dag combiner enough to simplify the code expanded before
 175 legalize.  It seems plausible that this knowledge would let it simplify other
 176 stuff too.
 177
 178 //===---------------------------------------------------------------------===//
 179
 180 For vector types, TargetData.cpp::getTypeInfo() returns alignment that is equal
 181 to the type size. It works but can be overly conservative as the alignment of
 182 specific vector types are target dependent.
 183
 184 //===---------------------------------------------------------------------===//
 185
 186 We should produce an unaligned load from code like this:
 187
 188 v4sf example(float *P) {
 189   return (v4sf){P[0], P[1], P[2], P[3] };
 190 }
 191
 192 //===---------------------------------------------------------------------===//
 193
 194 Add support for conditional increments, and other related patterns.  Instead
 195 of:
 196
 197         movl 136(%esp), %eax
 198         cmpl $0, %eax
 199         je LBB16_2      #cond_next
 200 LBB16_1:        #cond_true
 201         incl _foo
 202 LBB16_2:        #cond_next
 203
 204 emit:
 205         movl    _foo, %eax
 206         cmpl    $1, %edi
 207         sbbl    $-1, %eax
 208         movl    %eax, _foo
 209
 210 //===---------------------------------------------------------------------===//
 211
 212 Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
 213
 214 Expand these to calls of sin/cos and stores:
 215       double sincos(double x, double *sin, double *cos);
 216       float sincosf(float x, float *sin, float *cos);
 217       long double sincosl(long double x, long double *sin, long double *cos);
 218
 219 Doing so could allow SROA of the destination pointers.  See also:
 220 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17687
 221
 222 This is now easily doable with MRVs.  We could even make an intrinsic for this
 223 if anyone cared enough about sincos.
 224
 225 //===---------------------------------------------------------------------===//
 226
 227 Turn this into a single byte store with no load (the other 3 bytes are
 228 unmodified):
 229
 230 define void @test(i32* %P) {
 231         %tmp = load i32* %P
 232         %tmp14 = or i32 %tmp, 3305111552
 233         %tmp15 = and i32 %tmp14, 3321888767
 234         store i32 %tmp15, i32* %P
 235         ret void
 236 }
 237
 238 //===---------------------------------------------------------------------===//
 239
 240 quantum_sigma_x in 462.libquantum contains the following loop:
 241
 242       for(i=0; i<reg->size; i++)
 243         {
 244           /* Flip the target bit of each basis state */
 245           reg->node[i].state ^= ((MAX_UNSIGNED) 1 << target);
 246         }
 247
 248 Where MAX_UNSIGNED/state is a 64-bit int.  On a 32-bit platform it would be just
 249 so cool to turn it into something like:
 250
 251    long long Res = ((MAX_UNSIGNED) 1 << target);
 252    if (target < 32) {
 253      for(i=0; i<reg->size; i++)
 254        reg->node[i].state ^= Res & 0xFFFFFFFFULL;
 255    } else {
 256      for(i=0; i<reg->size; i++)
 257        reg->node[i].state ^= Res & 0xFFFFFFFF00000000ULL
 258    }
 259
 260 ... which would only do one 32-bit XOR per loop iteration instead of two.
 261
 262 It would also be nice to recognize the reg->size doesn't alias reg->node[i], but
 263 this requires TBAA.
 264
 265 //===---------------------------------------------------------------------===//
 266
 267 This isn't recognized as bswap by instcombine (yes, it really is bswap):
 268
 269 unsigned long reverse(unsigned v) {
 270     unsigned t;
 271     t = v ^ ((v << 16) | (v >> 16));
 272     t &= ~0xff0000;
 273     v = (v << 24) | (v >> 8);
 274     return v ^ (t >> 8);
 275 }
 276
 277 //===---------------------------------------------------------------------===//
 278
 279 [LOOP RECOGNITION]
 280
 281 These idioms should be recognized as popcount (see PR1488):
 282
 283 unsigned countbits_slow(unsigned v) {
 284   unsigned c;
 285   for (c = 0; v; v >>= 1)
 286     c += v & 1;
 287   return c;
 288 }
 289 unsigned countbits_fast(unsigned v){
 290   unsigned c;
 291   for (c = 0; v; c++)
 292     v &= v - 1; // clear the least significant bit set
 293   return c;
 294 }
 295
 296 BITBOARD = unsigned long long
 297 int PopCnt(register BITBOARD a) {
 298   register int c=0;
 299   while(a) {
 300     c++;
 301     a &= a - 1;
 302   }
 303   return c;
 304 }
 305 unsigned int popcount(unsigned int input) {
 306   unsigned int count = 0;
 307   for (unsigned int i =  0; i < 4 * 8; i++)
 308     count += (input >> i) & i;
 309   return count;
 310 }
 311
 312 This is a form of idiom recognition for loops, the same thing that could be
 313 useful for recognizing memset/memcpy.
 314
 315 //===---------------------------------------------------------------------===//
 316
 317 These should turn into single 16-bit (unaligned?) loads on little/big endian
 318 processors.
 319
 320 unsigned short read_16_le(const unsigned char *adr) {
 321   return adr[0] | (adr[1] << 8);
 322 }
 323 unsigned short read_16_be(const unsigned char *adr) {
 324   return (adr[0] << 8) | adr[1];
 325 }
 326
 327 //===---------------------------------------------------------------------===//
 328
 329 -instcombine should handle this transform:
 330    icmp pred (sdiv X / C1 ), C2
 331 when X, C1, and C2 are unsigned.  Similarly for udiv and signed operands.
 332
 333 Currently InstCombine avoids this transform but will do it when the signs of
 334 the operands and the sign of the divide match. See the FIXME in
 335 InstructionCombining.cpp in the visitSetCondInst method after the switch case
 336 for Instruction::UDiv (around line 4447) for more details.
 337
 338 The SingleSource/Benchmarks/Shootout-C++/hash and hash2 tests have examples of
 339 this construct.
 340
 341 //===---------------------------------------------------------------------===//
 342
 343 [LOOP RECOGNITION]
 344
 345 viterbi speeds up *significantly* if the various "history" related copy loops
 346 are turned into memcpy calls at the source level.  We need a "loops to memcpy"
 347 pass.
 348
 349 //===---------------------------------------------------------------------===//
 350
 351 [LOOP OPTIMIZATION]
 352
 353 SingleSource/Benchmarks/Misc/dt.c shows several interesting optimization
 354 opportunities in its double_array_divs_variable function: it needs loop
 355 interchange, memory promotion (which LICM already does), vectorization and
 356 variable trip count loop unrolling (since it has a constant trip count). ICC
 357 apparently produces this very nice code with -ffast-math:
 358
 359 ..B1.70:                        # Preds ..B1.70 ..B1.69
 360        mulpd     %xmm0, %xmm1                                  #108.2
 361        mulpd     %xmm0, %xmm1                                  #108.2
 362        mulpd     %xmm0, %xmm1                                  #108.2
 363        mulpd     %xmm0, %xmm1                                  #108.2
 364        addl      $8, %edx                                      #
 365        cmpl      $131072, %edx                                 #108.2
 366        jb        ..B1.70       # Prob 99%                      #108.2
 367
 368 It would be better to count down to zero, but this is a lot better than what we
 369 do.
 370
 371 //===---------------------------------------------------------------------===//
 372
 373 Consider:
 374
 375 typedef unsigned U32;
 376 typedef unsigned long long U64;
 377 int test (U32 *inst, U64 *regs) {
 378     U64 effective_addr2;
 379     U32 temp = *inst;
 380     int r1 = (temp >> 20) & 0xf;
 381     int b2 = (temp >> 16) & 0xf;
 382     effective_addr2 = temp & 0xfff;
 383     if (b2) effective_addr2 += regs[b2];
 384     b2 = (temp >> 12) & 0xf;
 385     if (b2) effective_addr2 += regs[b2];
 386     effective_addr2 &= regs[4];
 387      if ((effective_addr2 & 3) == 0)
 388         return 1;
 389     return 0;
 390 }
 391
 392 Note that only the low 2 bits of effective_addr2 are used.  On 32-bit systems,
 393 we don't eliminate the computation of the top half of effective_addr2 because
 394 we don't have whole-function selection dags.  On x86, this means we use one
 395 extra register for the function when effective_addr2 is declared as U64 than
 396 when it is declared U32.
 397
 398 PHI Slicing could be extended to do this.
 399
 400 //===---------------------------------------------------------------------===//
 401
 402 LSR should know what GPR types a target has from TargetData.  This code:
 403
 404 volatile short X, Y; // globals
 405
 406 void foo(int N) {
 407   int i;
 408   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 409 }
 410
 411 produces two near identical IV's (after promotion) on PPC/ARM:
 412
 413 LBB1_2:
 414         ldr r3, LCPI1_0
 415         ldr r3, [r3]
 416         strh r2, [r3]
 417         ldr r3, LCPI1_1
 418         ldr r3, [r3]
 419         strh r1, [r3]
 420         add r1, r1, #4
 421         add r2, r2, #1   <- [0,+,1]
 422         sub r0, r0, #1   <- [0,-,1]
 423         cmp r0, #0
 424         bne LBB1_2
 425
 426 LSR should reuse the "+" IV for the exit test.
 427
 428 //===---------------------------------------------------------------------===//
 429
 430 Tail call elim should be more aggressive, checking to see if the call is
 431 followed by an uncond branch to an exit block.
 432
 433 ; This testcase is due to tail-duplication not wanting to copy the return
 434 ; instruction into the terminating blocks because there was other code
 435 ; optimized out of the function after the taildup happened.
 436 ; RUN: llvm-as < %s | opt -tailcallelim | llvm-dis | not grep call
 437
 438 define i32 @t4(i32 %a) {
 439 entry:
 440         %tmp.1 = and i32 %a, 1          ; <i32> [#uses=1]
 441         %tmp.2 = icmp ne i32 %tmp.1, 0          ; <i1> [#uses=1]
 442         br i1 %tmp.2, label %then.0, label %else.0
 443
 444 then.0:         ; preds = %entry
 445         %tmp.5 = add i32 %a, -1         ; <i32> [#uses=1]
 446         %tmp.3 = call i32 @t4( i32 %tmp.5 )             ; <i32> [#uses=1]
 447         br label %return
 448
 449 else.0:         ; preds = %entry
 450         %tmp.7 = icmp ne i32 %a, 0              ; <i1> [#uses=1]
 451         br i1 %tmp.7, label %then.1, label %return
 452
 453 then.1:         ; preds = %else.0
 454         %tmp.11 = add i32 %a, -2                ; <i32> [#uses=1]
 455         %tmp.9 = call i32 @t4( i32 %tmp.11 )            ; <i32> [#uses=1]
 456         br label %return
 457
 458 return:         ; preds = %then.1, %else.0, %then.0
 459         %result.0 = phi i32 [ 0, %else.0 ], [ %tmp.3, %then.0 ],
 460                             [ %tmp.9, %then.1 ]
 461         ret i32 %result.0
 462 }
 463
 464 //===---------------------------------------------------------------------===//
 465
 466 Tail recursion elimination should handle:
 467
 468 int pow2m1(int n) {
 469  if (n == 0)
 470    return 0;
 471  return 2 * pow2m1 (n - 1) + 1;
 472 }
 473
 474 Also, multiplies can be turned into SHL's, so they should be handled as if
 475 they were associative.  "return foo() << 1" can be tail recursion eliminated.
 476
 477 //===---------------------------------------------------------------------===//
 478
 479 Argument promotion should promote arguments for recursive functions, like
 480 this:
 481
 482 ; RUN: llvm-as < %s | opt -argpromotion | llvm-dis | grep x.val
 483
 484 define internal i32 @foo(i32* %x) {
 485 entry:
 486         %tmp = load i32* %x             ; <i32> [#uses=0]
 487         %tmp.foo = call i32 @foo( i32* %x )             ; <i32> [#uses=1]
 488         ret i32 %tmp.foo
 489 }
 490
 491 define i32 @bar(i32* %x) {
 492 entry:
 493         %tmp3 = call i32 @foo( i32* %x )                ; <i32> [#uses=1]
 494         ret i32 %tmp3
 495 }
 496
 497 //===---------------------------------------------------------------------===//
 498
 499 We should investigate an instruction sinking pass.  Consider this silly
 500 example in pic mode:
 501
 502 #include <assert.h>
 503 void foo(int x) {
 504   assert(x);
 505   //...
 506 }
 507
 508 we compile this to:
 509 _foo:
 510         subl    $28, %esp
 511         call    "L1$pb"
 512 "L1$pb":
 513         popl    %eax
 514         cmpl    $0, 32(%esp)
 515         je      LBB1_2  # cond_true
 516 LBB1_1: # return
 517         # ...
 518         addl    $28, %esp
 519         ret
 520 LBB1_2: # cond_true
 521 ...
 522
 523 The PIC base computation (call+popl) is only used on one path through the
 524 code, but is currently always computed in the entry block.  It would be
 525 better to sink the picbase computation down into the block for the
 526 assertion, as it is the only one that uses it.  This happens for a lot of
 527 code with early outs.
 528
 529 Another example is loads of arguments, which are usually emitted into the
 530 entry block on targets like x86.  If not used in all paths through a
 531 function, they should be sunk into the ones that do.
 532
 533 In this case, whole-function-isel would also handle this.
 534
 535 //===---------------------------------------------------------------------===//
 536
 537 Investigate lowering of sparse switch statements into perfect hash tables:
 538 http://burtleburtle.net/bob/hash/perfect.html
 539
 540 //===---------------------------------------------------------------------===//
 541
 542 We should turn things like "load+fabs+store" and "load+fneg+store" into the
 543 corresponding integer operations.  On a yonah, this loop:
 544
 545 double a[256];
 546 void foo() {
 547   int i, b;
 548   for (b = 0; b < 10000000; b++)
 549   for (i = 0; i < 256; i++)
 550     a[i] = -a[i];
 551 }
 552
 553 is twice as slow as this loop:
 554
 555 long long a[256];
 556 void foo() {
 557   int i, b;
 558   for (b = 0; b < 10000000; b++)
 559   for (i = 0; i < 256; i++)
 560     a[i] ^= (1ULL << 63);
 561 }
 562
 563 and I suspect other processors are similar.  On X86 in particular this is a
 564 big win because doing this with integers allows the use of read/modify/write
 565 instructions.
 566
 567 //===---------------------------------------------------------------------===//
 568
 569 DAG Combiner should try to combine small loads into larger loads when
 570 profitable.  For example, we compile this C++ example:
 571
 572 struct THotKey { short Key; bool Control; bool Shift; bool Alt; };
 573 extern THotKey m_HotKey;
 574 THotKey GetHotKey () { return m_HotKey; }
 575
 576 into (-O3 -fno-exceptions -static -fomit-frame-pointer):
 577
 578 __Z9GetHotKeyv:
 579         pushl   %esi
 580         movl    8(%esp), %eax
 581         movb    _m_HotKey+3, %cl
 582         movb    _m_HotKey+4, %dl
 583         movb    _m_HotKey+2, %ch
 584         movw    _m_HotKey, %si
 585         movw    %si, (%eax)
 586         movb    %ch, 2(%eax)
 587         movb    %cl, 3(%eax)
 588         movb    %dl, 4(%eax)
 589         popl    %esi
 590         ret     $4
 591
 592 GCC produces:
 593
 594 __Z9GetHotKeyv:
 595         movl    _m_HotKey, %edx
 596         movl    4(%esp), %eax
 597         movl    %edx, (%eax)
 598         movzwl  _m_HotKey+4, %edx
 599         movw    %dx, 4(%eax)
 600         ret     $4
 601
 602 The LLVM IR contains the needed alignment info, so we should be able to
 603 merge the loads and stores into 4-byte loads:
 604
 605         %struct.THotKey = type { i16, i8, i8, i8 }
 606 define void @_Z9GetHotKeyv(%struct.THotKey* sret  %agg.result) nounwind  {
 607 ...
 608         %tmp2 = load i16* getelementptr (@m_HotKey, i32 0, i32 0), align 8
 609         %tmp5 = load i8* getelementptr (@m_HotKey, i32 0, i32 1), align 2
 610         %tmp8 = load i8* getelementptr (@m_HotKey, i32 0, i32 2), align 1
 611         %tmp11 = load i8* getelementptr (@m_HotKey, i32 0, i32 3), align 2
 612
 613 Alternatively, we should use a small amount of base-offset alias analysis
 614 to make it so the scheduler doesn't need to hold all the loads in regs at
 615 once.
 616
 617 //===---------------------------------------------------------------------===//
 618
 619 We should add an FRINT node to the DAG to model targets that have legal
 620 implementations of ceil/floor/rint.
 621
 622 //===---------------------------------------------------------------------===//
 623
 624 Consider:
 625
 626 int test() {
 627   long long input[8] = {1,1,1,1,1,1,1,1};
 628   foo(input);
 629 }
 630
 631 We currently compile this into a memcpy from a global array since the
 632 initializer is fairly large and not memset'able.  This is good, but the memcpy
 633 gets lowered to load/stores in the code generator.  This is also ok, except
 634 that the codegen lowering for memcpy doesn't handle the case when the source
 635 is a constant global.  This gives us atrocious code like this:
 636
 637         call    "L1$pb"
 638 "L1$pb":
 639         popl    %eax
 640         movl    _C.0.1444-"L1$pb"+32(%eax), %ecx
 641         movl    %ecx, 40(%esp)
 642         movl    _C.0.1444-"L1$pb"+20(%eax), %ecx
 643         movl    %ecx, 28(%esp)
 644         movl    _C.0.1444-"L1$pb"+36(%eax), %ecx
 645         movl    %ecx, 44(%esp)
 646         movl    _C.0.1444-"L1$pb"+44(%eax), %ecx
 647         movl    %ecx, 52(%esp)
 648         movl    _C.0.1444-"L1$pb"+40(%eax), %ecx
 649         movl    %ecx, 48(%esp)
 650         movl    _C.0.1444-"L1$pb"+12(%eax), %ecx
 651         movl    %ecx, 20(%esp)
 652         movl    _C.0.1444-"L1$pb"+4(%eax), %ecx
 653 ...
 654
 655 instead of:
 656         movl    $1, 16(%esp)
 657         movl    $0, 20(%esp)
 658         movl    $1, 24(%esp)
 659         movl    $0, 28(%esp)
 660         movl    $1, 32(%esp)
 661         movl    $0, 36(%esp)
 662         ...
 663
 664 //===---------------------------------------------------------------------===//
 665
 666 http://llvm.org/PR717:
 667
 668 The following code should compile into "ret int undef". Instead, LLVM
 669 produces "ret int 0":
 670
 671 int f() {
 672   int x = 4;
 673   int y;
 674   if (x == 3) y = 0;
 675   return y;
 676 }
 677
 678 //===---------------------------------------------------------------------===//
 679
 680 The loop unroller should partially unroll loops (instead of peeling them)
 681 when code growth isn't too bad and when an unroll count allows simplification
 682 of some code within the loop.  One trivial example is:
 683
 684 #include <stdio.h>
 685 int main() {
 686     int nRet = 17;
 687     int nLoop;
 688     for ( nLoop = 0; nLoop < 1000; nLoop++ ) {
 689         if ( nLoop & 1 )
 690             nRet += 2;
 691         else
 692             nRet -= 1;
 693     }
 694     return nRet;
 695 }
 696
 697 Unrolling by 2 would eliminate the '&1' in both copies, leading to a net
 698 reduction in code size.  The resultant code would then also be suitable for
 699 exit value computation.
 700
 701 //===---------------------------------------------------------------------===//
 702
 703 We miss a bunch of rotate opportunities on various targets, including ppc, x86,
 704 etc.  On X86, we miss a bunch of 'rotate by variable' cases because the rotate
 705 matching code in dag combine doesn't look through truncates aggressively
 706 enough.  Here are some testcases reduces from GCC PR17886:
 707
 708 unsigned long long f(unsigned long long x, int y) {
 709   return (x << y) | (x >> 64-y);
 710 }
 711 unsigned f2(unsigned x, int y){
 712   return (x << y) | (x >> 32-y);
 713 }
 714 unsigned long long f3(unsigned long long x){
 715   int y = 9;
 716   return (x << y) | (x >> 64-y);
 717 }
 718 unsigned f4(unsigned x){
 719   int y = 10;
 720   return (x << y) | (x >> 32-y);
 721 }
 722 unsigned long long f5(unsigned long long x, unsigned long long y) {
 723   return (x << 8) | ((y >> 48) & 0xffull);
 724 }
 725 unsigned long long f6(unsigned long long x, unsigned long long y, int z) {
 726   switch(z) {
 727   case 1:
 728     return (x << 8) | ((y >> 48) & 0xffull);
 729   case 2:
 730     return (x << 16) | ((y >> 40) & 0xffffull);
 731   case 3:
 732     return (x << 24) | ((y >> 32) & 0xffffffull);
 733   case 4:
 734     return (x << 32) | ((y >> 24) & 0xffffffffull);
 735   default:
 736     return (x << 40) | ((y >> 16) & 0xffffffffffull);
 737   }
 738 }
 739
 740 On X86-64, we only handle f2/f3/f4 right.  On x86-32, a few of these
 741 generate truly horrible code, instead of using shld and friends.  On
 742 ARM, we end up with calls to L___lshrdi3/L___ashldi3 in f, which is
 743 badness.  PPC64 misses f, f5 and f6.  CellSPU aborts in isel.
 744
 745 //===---------------------------------------------------------------------===//
 746
 747 We do a number of simplifications in simplify libcalls to strength reduce
 748 standard library functions, but we don't currently merge them together.  For
 749 example, it is useful to merge memcpy(a,b,strlen(b)) -> strcpy.  This can only
 750 be done safely if "b" isn't modified between the strlen and memcpy of course.
 751
 752 //===---------------------------------------------------------------------===//
 753
 754 We compile this program: (from GCC PR11680)
 755 http://gcc.gnu.org/bugzilla/attachment.cgi?id=4487
 756
 757 Into code that runs the same speed in fast/slow modes, but both modes run 2x
 758 slower than when compile with GCC (either 4.0 or 4.2):
 759
 760 $ llvm-g++ perf.cpp -O3 -fno-exceptions
 761 $ time ./a.out fast
 762 1.821u 0.003s 0:01.82 100.0%    0+0k 0+0io 0pf+0w
 763
 764 $ g++ perf.cpp -O3 -fno-exceptions
 765 $ time ./a.out fast
 766 0.821u 0.001s 0:00.82 100.0%    0+0k 0+0io 0pf+0w
 767
 768 It looks like we are making the same inlining decisions, so this may be raw
 769 codegen badness or something else (haven't investigated).
 770
 771 //===---------------------------------------------------------------------===//
 772
 773 We miss some instcombines for stuff like this:
 774 void bar (void);
 775 void foo (unsigned int a) {
 776   /* This one is equivalent to a >= (3 << 2).  */
 777   if ((a >> 2) >= 3)
 778     bar ();
 779 }
 780
 781 A few other related ones are in GCC PR14753.
 782
 783 //===---------------------------------------------------------------------===//
 784
 785 Divisibility by constant can be simplified (according to GCC PR12849) from
 786 being a mulhi to being a mul lo (cheaper).  Testcase:
 787
 788 void bar(unsigned n) {
 789   if (n % 3 == 0)
 790     true();
 791 }
 792
 793 This is equivalent to the following, where 2863311531 is the multiplicative
 794 inverse of 3, and 1431655766 is ((2^32)-1)/3+1:
 795 void bar(unsigned n) {
 796   if (n * 2863311531U < 1431655766U)
 797     true();
 798 }
 799
 800 The same transformation can work with an even modulo with the addition of a
 801 rotate: rotate the result of the multiply to the right by the number of bits
 802 which need to be zero for the condition to be true, and shrink the compare RHS
 803 by the same amount.  Unless the target supports rotates, though, that
 804 transformation probably isn't worthwhile.
 805
 806 The transformation can also easily be made to work with non-zero equality
 807 comparisons: just transform, for example, "n % 3 == 1" to "(n-1) % 3 == 0".
 808
 809 //===---------------------------------------------------------------------===//
 810
 811 Better mod/ref analysis for scanf would allow us to eliminate the vtable and a
 812 bunch of other stuff from this example (see PR1604):
 813
 814 #include <cstdio>
 815 struct test {
 816     int val;
 817     virtual ~test() {}
 818 };
 819
 820 int main() {
 821     test t;
 822     std::scanf("%d", &t.val);
 823     std::printf("%d\n", t.val);
 824 }
 825
 826 //===---------------------------------------------------------------------===//
 827
 828 These functions perform the same computation, but produce different assembly.
 829
 830 define i8 @select(i8 %x) readnone nounwind {
 831   %A = icmp ult i8 %x, 250
 832   %B = select i1 %A, i8 0, i8 1
 833   ret i8 %B
 834 }
 835
 836 define i8 @addshr(i8 %x) readnone nounwind {
 837   %A = zext i8 %x to i9
 838   %B = add i9 %A, 6       ;; 256 - 250 == 6
 839   %C = lshr i9 %B, 8
 840   %D = trunc i9 %C to i8
 841   ret i8 %D
 842 }
 843
 844 //===---------------------------------------------------------------------===//
 845
 846 From gcc bug 24696:
 847 int
 848 f (unsigned long a, unsigned long b, unsigned long c)
 849 {
 850   return ((a & (c - 1)) != 0) || ((b & (c - 1)) != 0);
 851 }
 852 int
 853 f (unsigned long a, unsigned long b, unsigned long c)
 854 {
 855   return ((a & (c - 1)) != 0) | ((b & (c - 1)) != 0);
 856 }
 857 Both should combine to ((a|b) & (c-1)) != 0.  Currently not optimized with
 858 "clang -emit-llvm-bc | opt -std-compile-opts".
 859
 860 //===---------------------------------------------------------------------===//
 861
 862 From GCC Bug 20192:
 863 #define PMD_MASK    (~((1UL << 23) - 1))
 864 void clear_pmd_range(unsigned long start, unsigned long end)
 865 {
 866    if (!(start & ~PMD_MASK) && !(end & ~PMD_MASK))
 867        f();
 868 }
 869 The expression should optimize to something like
 870 "!((start|end)&~PMD_MASK). Currently not optimized with "clang
 871 -emit-llvm-bc | opt -std-compile-opts".
 872
 873 //===---------------------------------------------------------------------===//
 874
 875 From GCC Bug 3756:
 876 int
 877 pn (int n)
 878 {
 879  return (n >= 0 ? 1 : -1);
 880 }
 881 Should combine to (n >> 31) | 1.  Currently not optimized with "clang
 882 -emit-llvm-bc | opt -std-compile-opts | llc".
 883
 884 //===---------------------------------------------------------------------===//
 885
 886 void a(int variable)
 887 {
 888  if (variable == 4 || variable == 6)
 889    bar();
 890 }
 891 This should optimize to "if ((variable | 2) == 6)".  Currently not
 892 optimized with "clang -emit-llvm-bc | opt -std-compile-opts | llc".
 893
 894 //===---------------------------------------------------------------------===//
 895
 896 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return
 897 i;}
 898 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 899 These should combine to the same thing.  Currently, the first function
 900 produces better code on X86.
 901
 902 //===---------------------------------------------------------------------===//
 903
 904 From GCC Bug 15784:
 905 #define abs(x) x>0?x:-x
 906 int f(int x, int y)
 907 {
 908  return (abs(x)) >= 0;
 909 }
 910 This should optimize to x == INT_MIN. (With -fwrapv.)  Currently not
 911 optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
 912
 913 //===---------------------------------------------------------------------===//
 914
 915 From GCC Bug 14753:
 916 void
 917 rotate_cst (unsigned int a)
 918 {
 919  a = (a << 10) | (a >> 22);
 920  if (a == 123)
 921    bar ();
 922 }
 923 void
 924 minus_cst (unsigned int a)
 925 {
 926  unsigned int tem;
 927
 928  tem = 20 - a;
 929  if (tem == 5)
 930    bar ();
 931 }
 932 void
 933 mask_gt (unsigned int a)
 934 {
 935  /* This is equivalent to a > 15.  */
 936  if ((a & ~7) > 8)
 937    bar ();
 938 }
 939 void
 940 rshift_gt (unsigned int a)
 941 {
 942  /* This is equivalent to a > 23.  */
 943  if ((a >> 2) > 5)
 944    bar ();
 945 }
 946 All should simplify to a single comparison.  All of these are
 947 currently not optimized with "clang -emit-llvm-bc | opt
 948 -std-compile-opts".
 949
 950 //===---------------------------------------------------------------------===//
 951
 952 From GCC Bug 32605:
 953 int c(int* x) {return (char*)x+2 == (char*)x;}
 954 Should combine to 0.  Currently not optimized with "clang
 955 -emit-llvm-bc | opt -std-compile-opts" (although llc can optimize it).
 956
 957 //===---------------------------------------------------------------------===//
 958
 959 int a(unsigned b) {return ((b << 31) | (b << 30)) >> 31;}
 960 Should be combined to  "((b >> 1) | b) & 1".  Currently not optimized
 961 with "clang -emit-llvm-bc | opt -std-compile-opts".
 962
 963 //===---------------------------------------------------------------------===//
 964
 965 unsigned a(unsigned x, unsigned y) { return x | (y & 1) | (y & 2);}
 966 Should combine to "x | (y & 3)".  Currently not optimized with "clang
 967 -emit-llvm-bc | opt -std-compile-opts".
 968
 969 //===---------------------------------------------------------------------===//
 970
 971 int a(int a, int b, int c) {return (~a & c) | ((c|a) & b);}
 972 Should fold to "(~a & c) | (a & b)".  Currently not optimized with
 973 "clang -emit-llvm-bc | opt -std-compile-opts".
 974
 975 //===---------------------------------------------------------------------===//
 976
 977 int a(int a,int b) {return (~(a|b))|a;}
 978 Should fold to "a|~b".  Currently not optimized with "clang
 979 -emit-llvm-bc | opt -std-compile-opts".
 980
 981 //===---------------------------------------------------------------------===//
 982
 983 int a(int a, int b) {return (a&&b) || (a&&!b);}
 984 Should fold to "a".  Currently not optimized with "clang -emit-llvm-bc
 985 | opt -std-compile-opts".
 986
 987 //===---------------------------------------------------------------------===//
 988
 989 int a(int a, int b, int c) {return (a&&b) || (!a&&c);}
 990 Should fold to "a ? b : c", or at least something sane.  Currently not
 991 optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
 992
 993 //===---------------------------------------------------------------------===//
 994
 995 int a(int a, int b, int c) {return (a&&b) || (a&&c) || (a&&b&&c);}
 996 Should fold to a && (b || c).  Currently not optimized with "clang
 997 -emit-llvm-bc | opt -std-compile-opts".
 998
 999 //===---------------------------------------------------------------------===//
1000
1001 int a(int x) {return x | ((x & 8) ^ 8);}
1002 Should combine to x | 8.  Currently not optimized with "clang
1003 -emit-llvm-bc | opt -std-compile-opts".
1004
1005 //===---------------------------------------------------------------------===//
1006
1007 int a(int x) {return x ^ ((x & 8) ^ 8);}
1008 Should also combine to x | 8.  Currently not optimized with "clang
1009 -emit-llvm-bc | opt -std-compile-opts".
1010
1011 //===---------------------------------------------------------------------===//
1012
1013 int a(int x) {return (x & 8) == 0 ? -1 : -9;}
1014 Should combine to (x | -9) ^ 8.  Currently not optimized with "clang
1015 -emit-llvm-bc | opt -std-compile-opts".
1016
1017 //===---------------------------------------------------------------------===//
1018
1019 int a(int x) {return (x & 8) == 0 ? -9 : -1;}
1020 Should combine to x | -9.  Currently not optimized with "clang
1021 -emit-llvm-bc | opt -std-compile-opts".
1022
1023 //===---------------------------------------------------------------------===//
1024
1025 int a(int x) {return ((x | -9) ^ 8) & x;}
1026 Should combine to x & -9.  Currently not optimized with "clang
1027 -emit-llvm-bc | opt -std-compile-opts".
1028
1029 //===---------------------------------------------------------------------===//
1030
1031 unsigned a(unsigned a) {return a * 0x11111111 >> 28 & 1;}
1032 Should combine to "a * 0x88888888 >> 31".  Currently not optimized
1033 with "clang -emit-llvm-bc | opt -std-compile-opts".
1034
1035 //===---------------------------------------------------------------------===//
1036
1037 unsigned a(char* x) {if ((*x & 32) == 0) return b();}
1038 There's an unnecessary zext in the generated code with "clang
1039 -emit-llvm-bc | opt -std-compile-opts".
1040
1041 //===---------------------------------------------------------------------===//
1042
1043 unsigned a(unsigned long long x) {return 40 * (x >> 1);}
1044 Should combine to "20 * (((unsigned)x) & -2)".  Currently not
1045 optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
1046
1047 //===---------------------------------------------------------------------===//
1048
1049 This was noticed in the entryblock for grokdeclarator in 403.gcc:
1050
1051         %tmp = icmp eq i32 %decl_context, 4
1052         %decl_context_addr.0 = select i1 %tmp, i32 3, i32 %decl_context
1053         %tmp1 = icmp eq i32 %decl_context_addr.0, 1
1054         %decl_context_addr.1 = select i1 %tmp1, i32 0, i32 %decl_context_addr.0
1055
1056 tmp1 should be simplified to something like:
1057   (!tmp || decl_context == 1)
1058
1059 This allows recursive simplifications, tmp1 is used all over the place in
1060 the function, e.g. by:
1061
1062         %tmp23 = icmp eq i32 %decl_context_addr.1, 0            ; <i1> [#uses=1]
1063         %tmp24 = xor i1 %tmp1, true             ; <i1> [#uses=1]
1064         %or.cond8 = and i1 %tmp23, %tmp24               ; <i1> [#uses=1]
1065
1066 later.
1067
1068 //===---------------------------------------------------------------------===//
1069
1070 [STORE SINKING]
1071
1072 Store sinking: This code:
1073
1074 void f (int n, int *cond, int *res) {
1075     int i;
1076     *res = 0;
1077     for (i = 0; i < n; i++)
1078         if (*cond)
1079             *res ^= 234; /* (*) */
1080 }
1081
1082 On this function GVN hoists the fully redundant value of *res, but nothing
1083 moves the store out.  This gives us this code:
1084
1085 bb:             ; preds = %bb2, %entry
1086         %.rle = phi i32 [ 0, %entry ], [ %.rle6, %bb2 ]
1087         %i.05 = phi i32 [ 0, %entry ], [ %indvar.next, %bb2 ]
1088         %1 = load i32* %cond, align 4
1089         %2 = icmp eq i32 %1, 0
1090         br i1 %2, label %bb2, label %bb1
1091
1092 bb1:            ; preds = %bb
1093         %3 = xor i32 %.rle, 234
1094         store i32 %3, i32* %res, align 4
1095         br label %bb2
1096
1097 bb2:            ; preds = %bb, %bb1
1098         %.rle6 = phi i32 [ %3, %bb1 ], [ %.rle, %bb ]
1099         %indvar.next = add i32 %i.05, 1
1100         %exitcond = icmp eq i32 %indvar.next, %n
1101         br i1 %exitcond, label %return, label %bb
1102
1103 DSE should sink partially dead stores to get the store out of the loop.
1104
1105 Here's another partial dead case:
1106 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12395
1107
1108 //===---------------------------------------------------------------------===//
1109
1110 Scalar PRE hoists the mul in the common block up to the else:
1111
1112 int test (int a, int b, int c, int g) {
1113   int d, e;
1114   if (a)
1115     d = b * c;
1116   else
1117     d = b - c;
1118   e = b * c + g;
1119   return d + e;
1120 }
1121
1122 It would be better to do the mul once to reduce codesize above the if.
1123 This is GCC PR38204.
1124
1125 //===---------------------------------------------------------------------===//
1126
1127 [STORE SINKING]
1128
1129 GCC PR37810 is an interesting case where we should sink load/store reload
1130 into the if block and outside the loop, so we don't reload/store it on the
1131 non-call path.
1132
1133 for () {
1134   *P += 1;
1135   if ()
1136     call();
1137   else
1138     ...
1139 ->
1140 tmp = *P
1141 for () {
1142   tmp += 1;
1143   if () {
1144     *P = tmp;
1145     call();
1146     tmp = *P;
1147   } else ...
1148 }
1149 *P = tmp;
1150
1151 We now hoist the reload after the call (Transforms/GVN/lpre-call-wrap.ll), but
1152 we don't sink the store.  We need partially dead store sinking.
1153
1154 //===---------------------------------------------------------------------===//
1155
1156 [LOAD PRE CRIT EDGE SPLITTING]
1157
1158 GCC PR37166: Sinking of loads prevents SROA'ing the "g" struct on the stack
1159 leading to excess stack traffic. This could be handled by GVN with some crazy
1160 symbolic phi translation.  The code we get looks like (g is on the stack):
1161
1162 bb2:            ; preds = %bb1
1163 ..
1164         %9 = getelementptr %struct.f* %g, i32 0, i32 0
1165         store i32 %8, i32* %9, align  bel %bb3
1166
1167 bb3:            ; preds = %bb1, %bb2, %bb
1168         %c_addr.0 = phi %struct.f* [ %g, %bb2 ], [ %c, %bb ], [ %c, %bb1 ]
1169         %b_addr.0 = phi %struct.f* [ %b, %bb2 ], [ %g, %bb ], [ %b, %bb1 ]
1170         %10 = getelementptr %struct.f* %c_addr.0, i32 0, i32 0
1171         %11 = load i32* %10, align 4
1172
1173 %11 is partially redundant, an in BB2 it should have the value %8.
1174
1175 GCC PR33344 and PR35287 are similar cases.
1176
1177
1178 //===---------------------------------------------------------------------===//
1179
1180 [LOAD PRE]
1181
1182 There are many load PRE testcases in testsuite/gcc.dg/tree-ssa/loadpre* in the
1183 GCC testsuite, ones we don't get yet are (checked through loadpre25):
1184
1185 [CRIT EDGE BREAKING]
1186 loadpre3.c predcom-4.c
1187
1188 [PRE OF READONLY CALL]
1189 loadpre5.c
1190
1191 [TURN SELECT INTO BRANCH]
1192 loadpre14.c loadpre15.c
1193
1194 actually a conditional increment: loadpre18.c loadpre19.c
1195
1196
1197 //===---------------------------------------------------------------------===//
1198
1199 [SCALAR PRE]
1200 There are many PRE testcases in testsuite/gcc.dg/tree-ssa/ssa-pre-*.c in the
1201 GCC testsuite.
1202
1203 //===---------------------------------------------------------------------===//
1204
1205 There are some interesting cases in testsuite/gcc.dg/tree-ssa/pred-comm* in the
1206 GCC testsuite.  For example, we get the first example in predcom-1.c, but
1207 miss the second one:
1208
1209 unsigned fib[1000];
1210 unsigned avg[1000];
1211
1212 __attribute__ ((noinline))
1213 void count_averages(int n) {
1214   int i;
1215   for (i = 1; i < n; i++)
1216     avg[i] = (((unsigned long) fib[i - 1] + fib[i] + fib[i + 1]) / 3) & 0xffff;
1217 }
1218
1219 which compiles into two loads instead of one in the loop.
1220
1221 predcom-2.c is the same as predcom-1.c
1222
1223 predcom-3.c is very similar but needs loads feeding each other instead of
1224 store->load.
1225
1226
1227 //===---------------------------------------------------------------------===//
1228
1229 [ALIAS ANALYSIS]
1230
1231 Type based alias analysis:
1232 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14705
1233
1234 We should do better analysis of posix_memalign.  At the least it should
1235 no-capture its pointer argument, at best, we should know that the out-value
1236 result doesn't point to anything (like malloc).  One example of this is in
1237 SingleSource/Benchmarks/Misc/dt.c
1238
1239 //===---------------------------------------------------------------------===//
1240
1241 A/B get pinned to the stack because we turn an if/then into a select instead
1242 of PRE'ing the load/store.  This may be fixable in instcombine:
1243 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37892
1244
1245 struct X { int i; };
1246 int foo (int x) {
1247   struct X a;
1248   struct X b;
1249   struct X *p;
1250   a.i = 1;
1251   b.i = 2;
1252   if (x)
1253     p = &a;
1254   else
1255     p = &b;
1256   return p->i;
1257 }
1258
1259 //===---------------------------------------------------------------------===//
1260
1261 Interesting missed case because of control flow flattening (should be 2 loads):
1262 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26629
1263 With: llvm-gcc t2.c -S -o - -O0 -emit-llvm | llvm-as |
1264              opt -mem2reg -gvn -instcombine | llvm-dis
1265 we miss it because we need 1) CRIT EDGE 2) MULTIPLE DIFFERENT
1266 VALS PRODUCED BY ONE BLOCK OVER DIFFERENT PATHS
1267
1268 //===---------------------------------------------------------------------===//
1269
1270 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19633
1271 We could eliminate the branch condition here, loading from null is undefined:
1272
1273 struct S { int w, x, y, z; };
1274 struct T { int r; struct S s; };
1275 void bar (struct S, int);
1276 void foo (int a, struct T b)
1277 {
1278   struct S *c = 0;
1279   if (a)
1280     c = &b.s;
1281   bar (*c, a);
1282 }
1283
1284 //===---------------------------------------------------------------------===//
1285
1286 simplifylibcalls should do several optimizations for strspn/strcspn:
1287
1288 strcspn(x, "") -> strlen(x)
1289 strcspn("", x) -> 0
1290 strspn("", x) -> 0
1291 strspn(x, "") -> strlen(x)
1292 strspn(x, "a") -> strchr(x, 'a')-x
1293
1294 strcspn(x, "a") -> inlined loop for up to 3 letters (similarly for strspn):
1295
1296 size_t __strcspn_c3 (__const char *__s, int __reject1, int __reject2,
1297                      int __reject3) {
1298   register size_t __result = 0;
1299   while (__s[__result] != '\0' && __s[__result] != __reject1 &&
1300          __s[__result] != __reject2 && __s[__result] != __reject3)
1301     ++__result;
1302   return __result;
1303 }
1304
1305 This should turn into a switch on the character.  See PR3253 for some notes on
1306 codegen.
1307
1308 456.hmmer apparently uses strcspn and strspn a lot.  471.omnetpp uses strspn.
1309
1310 //===---------------------------------------------------------------------===//
1311
1312 "gas" uses this idiom:
1313   else if (strchr ("+-/*%|&^:[]()~", *intel_parser.op_string))
1314 ..
1315   else if (strchr ("<>", *intel_parser.op_string)
1316
1317 Those should be turned into a switch.
1318
1319 //===---------------------------------------------------------------------===//
1320
1321 252.eon contains this interesting code:
1322
1323         %3072 = getelementptr [100 x i8]* %tempString, i32 0, i32 0
1324         %3073 = call i8* @strcpy(i8* %3072, i8* %3071) nounwind
1325         %strlen = call i32 @strlen(i8* %3072)    ; uses = 1
1326         %endptr = getelementptr [100 x i8]* %tempString, i32 0, i32 %strlen
1327         call void @llvm.memcpy.i32(i8* %endptr,
1328           i8* getelementptr ([5 x i8]* @"\01LC42", i32 0, i32 0), i32 5, i32 1)
1329         %3074 = call i32 @strlen(i8* %endptr) nounwind readonly
1330
1331 This is interesting for a couple reasons.  First, in this:
1332
1333         %3073 = call i8* @strcpy(i8* %3072, i8* %3071) nounwind
1334         %strlen = call i32 @strlen(i8* %3072)
1335
1336 The strlen could be replaced with: %strlen = sub %3072, %3073, because the
1337 strcpy call returns a pointer to the end of the string.  Based on that, the
1338 endptr GEP just becomes equal to 3073, which eliminates a strlen call and GEP.
1339
1340 Second, the memcpy+strlen strlen can be replaced with:
1341
1342         %3074 = call i32 @strlen([5 x i8]* @"\01LC42") nounwind readonly
1343
1344 Because the destination was just copied into the specified memory buffer.  This,
1345 in turn, can be constant folded to "4".
1346
1347 In other code, it contains:
1348
1349         %endptr6978 = bitcast i8* %endptr69 to i32*
1350         store i32 7107374, i32* %endptr6978, align 1
1351         %3167 = call i32 @strlen(i8* %endptr69) nounwind readonly
1352
1353 Which could also be constant folded.  Whatever is producing this should probably
1354 be fixed to leave this as a memcpy from a string.
1355
1356 Further, eon also has an interesting partially redundant strlen call:
1357
1358 bb8:            ; preds = %_ZN18eonImageCalculatorC1Ev.exit
1359         %682 = getelementptr i8** %argv, i32 6          ; <i8**> [#uses=2]
1360         %683 = load i8** %682, align 4          ; <i8*> [#uses=4]
1361         %684 = load i8* %683, align 1           ; <i8> [#uses=1]
1362         %685 = icmp eq i8 %684, 0               ; <i1> [#uses=1]
1363         br i1 %685, label %bb10, label %bb9
1364
1365 bb9:            ; preds = %bb8
1366         %686 = call i32 @strlen(i8* %683) nounwind readonly
1367         %687 = icmp ugt i32 %686, 254           ; <i1> [#uses=1]
1368         br i1 %687, label %bb10, label %bb11
1369
1370 bb10:           ; preds = %bb9, %bb8
1371         %688 = call i32 @strlen(i8* %683) nounwind readonly
1372
1373 This could be eliminated by doing the strlen once in bb8, saving code size and
1374 improving perf on the bb8->9->10 path.
1375
1376 //===---------------------------------------------------------------------===//
1377
1378 I see an interesting fully redundant call to strlen left in 186.crafty:InputMove
1379 which looks like:
1380        %movetext11 = getelementptr [128 x i8]* %movetext, i32 0, i32 0
1381
1382
1383 bb62:           ; preds = %bb55, %bb53
1384         %promote.0 = phi i32 [ %169, %bb55 ], [ 0, %bb53 ]
1385         %171 = call i32 @strlen(i8* %movetext11) nounwind readonly align 1
1386         %172 = add i32 %171, -1         ; <i32> [#uses=1]
1387         %173 = getelementptr [128 x i8]* %movetext, i32 0, i32 %172
1388
1389 ...  no stores ...
1390        br i1 %or.cond, label %bb65, label %bb72
1391
1392 bb65:           ; preds = %bb62
1393         store i8 0, i8* %173, align 1
1394         br label %bb72
1395
1396 bb72:           ; preds = %bb65, %bb62
1397         %trank.1 = phi i32 [ %176, %bb65 ], [ -1, %bb62 ]
1398         %177 = call i32 @strlen(i8* %movetext11) nounwind readonly align 1
1399
1400 Note that on the bb62->bb72 path, that the %177 strlen call is partially
1401 redundant with the %171 call.  At worst, we could shove the %177 strlen call
1402 up into the bb65 block moving it out of the bb62->bb72 path.   However, note
1403 that bb65 stores to the string, zeroing out the last byte.  This means that on
1404 that path the value of %177 is actually just %171-1.  A sub is cheaper than a
1405 strlen!
1406
1407 This pattern repeats several times, basically doing:
1408
1409   A = strlen(P);
1410   P[A-1] = 0;
1411   B = strlen(P);
1412   where it is "obvious" that B = A-1.
1413
1414 //===---------------------------------------------------------------------===//
1415
1416 186.crafty contains this interesting pattern:
1417
1418 %77 = call i8* @strstr(i8* getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0),
1419                        i8* %30)
1420 %phitmp648 = icmp eq i8* %77, getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0)
1421 br i1 %phitmp648, label %bb70, label %bb76
1422
1423 bb70:           ; preds = %OptionMatch.exit91, %bb69
1424         %78 = call i32 @strlen(i8* %30) nounwind readonly align 1               ; <i32> [#uses=1]
1425
1426 This is basically:
1427   cststr = "abcdef";
1428   if (strstr(cststr, P) == cststr) {
1429      x = strlen(P);
1430      ...
1431
1432 The strstr call would be significantly cheaper written as:
1433
1434 cststr = "abcdef";
1435 if (memcmp(P, str, strlen(P)))
1436   x = strlen(P);
1437
1438 This is memcmp+strlen instead of strstr.  This also makes the strlen fully
1439 redundant.
1440
1441 //===---------------------------------------------------------------------===//
1442
1443 186.crafty also contains this code:
1444
1445 %1906 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
1446 %1907 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1906
1447 %1908 = call i8* @strcpy(i8* %1907, i8* %1905) nounwind align 1
1448 %1909 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
1449 %1910 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1909
1450
1451 The last strlen is computable as 1908-@pgn_event, which means 1910=1908.
1452
1453 //===---------------------------------------------------------------------===//
1454
1455 186.crafty has this interesting pattern with the "out.4543" variable:
1456
1457 call void @llvm.memcpy.i32(
1458         i8* getelementptr ([10 x i8]* @out.4543, i32 0, i32 0),
1459        i8* getelementptr ([7 x i8]* @"\01LC28700", i32 0, i32 0), i32 7, i32 1)
1460 %101 = call@printf(i8* ...   @out.4543, i32 0, i32 0)) nounwind
1461
1462 It is basically doing:
1463
1464   memcpy(globalarray, "string");
1465   printf(...,  globalarray);
1466
1467 Anyway, by knowing that printf just reads the memory and forward substituting
1468 the string directly into the printf, this eliminates reads from globalarray.
1469 Since this pattern occurs frequently in crafty (due to the "DisplayTime" and
1470 other similar functions) there are many stores to "out".  Once all the printfs
1471 stop using "out", all that is left is the memcpy's into it.  This should allow
1472 globalopt to remove the "stored only" global.
1473
1474 //===---------------------------------------------------------------------===//
1475
1476 This code:
1477
1478 define inreg i32 @foo(i8* inreg %p) nounwind {
1479   %tmp0 = load i8* %p
1480   %tmp1 = ashr i8 %tmp0, 5
1481   %tmp2 = sext i8 %tmp1 to i32
1482   ret i32 %tmp2
1483 }
1484
1485 could be dagcombine'd to a sign-extending load with a shift.
1486 For example, on x86 this currently gets this:
1487
1488         movb    (%eax), %al
1489         sarb    $5, %al
1490         movsbl  %al, %eax
1491
1492 while it could get this:
1493
1494         movsbl  (%eax), %eax
1495         sarl    $5, %eax
1496
1497 //===---------------------------------------------------------------------===//
1498
1499 GCC PR31029:
1500
1501 int test(int x) { return 1-x == x; }     // --> return false
1502 int test2(int x) { return 2-x == x; }    // --> return x == 1 ?
1503
1504 Always foldable for odd constants, what is the rule for even?
1505
1506 //===---------------------------------------------------------------------===//
1507
1508 PR 3381: GEP to field of size 0 inside a struct could be turned into GEP
1509 for next field in struct (which is at same address).
1510
1511 For example: store of float into { {{}}, float } could be turned into a store to
1512 the float directly.
1513
1514 //===---------------------------------------------------------------------===//
1515
1516 #include <math.h>
1517 double foo(double a) {    return sin(a); }
1518
1519 This compiles into this on x86-64 Linux:
1520 foo:
1521         subq    $8, %rsp
1522         call    sin
1523         addq    $8, %rsp
1524         ret
1525 vs:
1526
1527 foo:
1528         jmp sin
1529
1530 //===---------------------------------------------------------------------===//
1531
1532 The arg promotion pass should make use of nocapture to make its alias analysis
1533 stuff much more precise.
1534
1535 //===---------------------------------------------------------------------===//
1536
1537 The following functions should be optimized to use a select instead of a
1538 branch (from gcc PR40072):
1539
1540 char char_int(int m) {if(m>7) return 0; return m;}
1541 int int_char(char m) {if(m>7) return 0; return m;}
1542
1543 //===---------------------------------------------------------------------===//
1544
1545 int func(int a, int b) { if (a & 0x80) b |= 0x80; else b &= ~0x80; return b; }
1546
1547 Generates this:
1548
1549 define i32 @func(i32 %a, i32 %b) nounwind readnone ssp {
1550 entry:
1551   %0 = and i32 %a, 128                            ; <i32> [#uses=1]
1552   %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
1553   %2 = or i32 %b, 128                             ; <i32> [#uses=1]
1554   %3 = and i32 %b, -129                           ; <i32> [#uses=1]
1555   %b_addr.0 = select i1 %1, i32 %3, i32 %2        ; <i32> [#uses=1]
1556   ret i32 %b_addr.0
1557 }
1558
1559 However, it's functionally equivalent to:
1560
1561          b = (b & ~0x80) | (a & 0x80);
1562
1563 Which generates this:
1564
1565 define i32 @func(i32 %a, i32 %b) nounwind readnone ssp {
1566 entry:
1567   %0 = and i32 %b, -129                           ; <i32> [#uses=1]
1568   %1 = and i32 %a, 128                            ; <i32> [#uses=1]
1569   %2 = or i32 %0, %1                              ; <i32> [#uses=1]
1570   ret i32 %2
1571 }
1572
1573 This can be generalized for other forms:
1574
1575      b = (b & ~0x80) | (a & 0x40) << 1;
1576
1577 //===---------------------------------------------------------------------===//
1578
1579 These two functions produce different code. They shouldn't:
1580
1581 #include <stdint.h>
1582
1583 uint8_t p1(uint8_t b, uint8_t a) {
1584   b = (b & ~0xc0) | (a & 0xc0);
1585   return (b);
1586 }
1587
1588 uint8_t p2(uint8_t b, uint8_t a) {
1589   b = (b & ~0x40) | (a & 0x40);
1590   b = (b & ~0x80) | (a & 0x80);
1591   return (b);
1592 }
1593
1594 define zeroext i8 @p1(i8 zeroext %b, i8 zeroext %a) nounwind readnone ssp {
1595 entry:
1596   %0 = and i8 %b, 63                              ; <i8> [#uses=1]
1597   %1 = and i8 %a, -64                             ; <i8> [#uses=1]
1598   %2 = or i8 %1, %0                               ; <i8> [#uses=1]
1599   ret i8 %2
1600 }
1601
1602 define zeroext i8 @p2(i8 zeroext %b, i8 zeroext %a) nounwind readnone ssp {
1603 entry:
1604   %0 = and i8 %b, 63                              ; <i8> [#uses=1]
1605   %.masked = and i8 %a, 64                        ; <i8> [#uses=1]
1606   %1 = and i8 %a, -128                            ; <i8> [#uses=1]
1607   %2 = or i8 %1, %0                               ; <i8> [#uses=1]
1608   %3 = or i8 %2, %.masked                         ; <i8> [#uses=1]
1609   ret i8 %3
1610 }
1611
1612 //===---------------------------------------------------------------------===//
1613
1614 IPSCCP does not currently propagate argument dependent constants through
1615 functions where it does not not all of the callers.  This includes functions
1616 with normal external linkage as well as templates, C99 inline functions etc.
1617 Specifically, it does nothing to:
1618
1619 define i32 @test(i32 %x, i32 %y, i32 %z) nounwind {
1620 entry:
1621   %0 = add nsw i32 %y, %z
1622   %1 = mul i32 %0, %x
1623   %2 = mul i32 %y, %z
1624   %3 = add nsw i32 %1, %2
1625   ret i32 %3
1626 }
1627
1628 define i32 @test2() nounwind {
1629 entry:
1630   %0 = call i32 @test(i32 1, i32 2, i32 4) nounwind
1631   ret i32 %0
1632 }
1633
1634 It would be interesting extend IPSCCP to be able to handle simple cases like
1635 this, where all of the arguments to a call are constant.  Because IPSCCP runs
1636 before inlining, trivial templates and inline functions are not yet inlined.
1637 The results for a function + set of constant arguments should be memoized in a
1638 map.
1639
1640 //===---------------------------------------------------------------------===//
1641
1642 The libcall constant folding stuff should be moved out of SimplifyLibcalls into
1643 libanalysis' constantfolding logic.  This would allow IPSCCP to be able to
1644 handle simple things like this:
1645
1646 static int foo(const char *X) { return strlen(X); }
1647 int bar() { return foo("abcd"); }
1648
1649 //===---------------------------------------------------------------------===//
1650
1651 InstCombine should use SimplifyDemandedBits to remove the or instruction:
1652
1653 define i1 @test(i8 %x, i8 %y) {
1654   %A = or i8 %x, 1
1655   %B = icmp ugt i8 %A, 3
1656   ret i1 %B
1657 }
1658
1659 Currently instcombine calls SimplifyDemandedBits with either all bits or just
1660 the sign bit, if the comparison is obviously a sign test. In this case, we only
1661 need all but the bottom two bits from %A, and if we gave that mask to SDB it
1662 would delete the or instruction for us.
1663
1664 //===---------------------------------------------------------------------===//
1665
1666 functionattrs doesn't know much about memcpy/memset.  This function should be
1667 marked readnone rather than readonly, since it only twiddles local memory, but
1668 functionattrs doesn't handle memset/memcpy/memmove aggressively:
1669
1670 struct X { int *p; int *q; };
1671 int foo() {
1672  int i = 0, j = 1;
1673  struct X x, y;
1674  int **p;
1675  y.p = &i;
1676  x.q = &j;
1677  p = __builtin_memcpy (&x, &y, sizeof (int *));
1678  return **p;
1679 }
1680
1681 //===---------------------------------------------------------------------===//
1682
1683 Missed instcombine transformation:
1684 define i1 @a(i32 %x) nounwind readnone {
1685 entry:
1686   %cmp = icmp eq i32 %x, 30
1687   %sub = add i32 %x, -30
1688   %cmp2 = icmp ugt i32 %sub, 9
1689   %or = or i1 %cmp, %cmp2
1690   ret i1 %or
1691 }
1692 This should be optimized to a single compare.  Testcase derived from gcc.
1693
1694 //===---------------------------------------------------------------------===//
1695
1696 Missed instcombine transformation:
1697 void b();
1698 void a(int x) { if (((1<<x)&8)==0) b(); }
1699
1700 The shift should be optimized out.  Testcase derived from gcc.
1701
1702 //===---------------------------------------------------------------------===//
1703
1704 Missed instcombine or reassociate transformation:
1705 int a(int a, int b) { return (a==12)&(b>47)&(b<58); }
1706
1707 The sgt and slt should be combined into a single comparison. Testcase derived
1708 from gcc.
1709
1710 //===---------------------------------------------------------------------===//
1711
1712 Missed instcombine transformation:
1713 define i32 @a(i32 %x) nounwind readnone {
1714 entry:
1715   %shr = lshr i32 %x, 5                           ; <i32> [#uses=1]
1716   %xor = xor i32 %shr, 67108864                   ; <i32> [#uses=1]
1717   %sub = add i32 %xor, -67108864                  ; <i32> [#uses=1]
1718   ret i32 %sub
1719 }
1720
1721 This function is equivalent to "ashr i32 %x, 5".  Testcase derived from gcc.
1722
1723 //===---------------------------------------------------------------------===//
1724
1725 isSafeToLoadUnconditionally should allow a GEP of a global/alloca with constant
1726 indicies within the bounds of the allocated object. Reduced example:
1727
1728 const int a[] = {3,6};
1729 int b(int y) { int* x = y ? &a[0] : &a[1]; return *x; }
1730
1731 All the loads should be eliminated.  Testcase derived from gcc.
1732
1733 //===---------------------------------------------------------------------===//