lib/Target/README.txt

   1 Target Independent Opportunities:
   2
   3 //===---------------------------------------------------------------------===//
   4
   5 We should recognized various "overflow detection" idioms and translate them into
   6 llvm.uadd.with.overflow and similar intrinsics.  Here is a multiply idiom:
   7
   8 unsigned int mul(unsigned int a,unsigned int b) {
   9  if ((unsigned long long)a*b>0xffffffff)
  10    exit(0);
  11   return a*b;
  12 }
  13
  14 The legalization code for mul-with-overflow needs to be made more robust before
  15 this can be implemented though.
  16
  17 //===---------------------------------------------------------------------===//
  18
  19 Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
  20 precision don't matter (ffastmath).  Misc/mandel will like this. :)  This isn't
  21 safe in general, even on darwin.  See the libm implementation of hypot for
  22 examples (which special case when x/y are exactly zero to get signed zeros etc
  23 right).
  24
  25 //===---------------------------------------------------------------------===//
  26
  27 On targets with expensive 64-bit multiply, we could LSR this:
  28
  29 for (i = ...; ++i) {
  30    x = 1ULL << i;
  31
  32 into:
  33  long long tmp = 1;
  34  for (i = ...; ++i, tmp+=tmp)
  35    x = tmp;
  36
  37 This would be a win on ppc32, but not x86 or ppc64.
  38
  39 //===---------------------------------------------------------------------===//
  40
  41 Shrink: (setlt (loadi32 P), 0) -> (setlt (loadi8 Phi), 0)
  42
  43 //===---------------------------------------------------------------------===//
  44
  45 Reassociate should turn things like:
  46
  47 int factorial(int X) {
  48  return X*X*X*X*X*X*X*X;
  49 }
  50
  51 into llvm.powi calls, allowing the code generator to produce balanced
  52 multiplication trees.
  53
  54 First, the intrinsic needs to be extended to support integers, and second the
  55 code generator needs to be enhanced to lower these to multiplication trees.
  56
  57 //===---------------------------------------------------------------------===//
  58
  59 Interesting? testcase for add/shift/mul reassoc:
  60
  61 int bar(int x, int y) {
  62   return x*x*x+y+x*x*x*x*x*y*y*y*y;
  63 }
  64 int foo(int z, int n) {
  65   return bar(z, n) + bar(2*z, 2*n);
  66 }
  67
  68 This is blocked on not handling X*X*X -> powi(X, 3) (see note above).  The issue
  69 is that we end up getting t = 2*X  s = t*t   and don't turn this into 4*X*X,
  70 which is the same number of multiplies and is canonical, because the 2*X has
  71 multiple uses.  Here's a simple example:
  72
  73 define i32 @test15(i32 %X1) {
  74   %B = mul i32 %X1, 47   ; X1*47
  75   %C = mul i32 %B, %B
  76   ret i32 %C
  77 }
  78
  79
  80 //===---------------------------------------------------------------------===//
  81
  82 Reassociate should handle the example in GCC PR16157:
  83
  84 extern int a0, a1, a2, a3, a4; extern int b0, b1, b2, b3, b4;
  85 void f () {  /* this can be optimized to four additions... */
  86         b4 = a4 + a3 + a2 + a1 + a0;
  87         b3 = a3 + a2 + a1 + a0;
  88         b2 = a2 + a1 + a0;
  89         b1 = a1 + a0;
  90 }
  91
  92 This requires reassociating to forms of expressions that are already available,
  93 something that reassoc doesn't think about yet.
  94
  95
  96 //===---------------------------------------------------------------------===//
  97
  98 This function: (derived from GCC PR19988)
  99 double foo(double x, double y) {
 100   return ((x + 0.1234 * y) * (x + -0.1234 * y));
 101 }
 102
 103 compiles to:
 104 _foo:
 105         movapd  %xmm1, %xmm2
 106         mulsd   LCPI1_1(%rip), %xmm1
 107         mulsd   LCPI1_0(%rip), %xmm2
 108         addsd   %xmm0, %xmm1
 109         addsd   %xmm0, %xmm2
 110         movapd  %xmm1, %xmm0
 111         mulsd   %xmm2, %xmm0
 112         ret
 113
 114 Reassociate should be able to turn it into:
 115
 116 double foo(double x, double y) {
 117   return ((x + 0.1234 * y) * (x - 0.1234 * y));
 118 }
 119
 120 Which allows the multiply by constant to be CSE'd, producing:
 121
 122 _foo:
 123         mulsd   LCPI1_0(%rip), %xmm1
 124         movapd  %xmm1, %xmm2
 125         addsd   %xmm0, %xmm2
 126         subsd   %xmm1, %xmm0
 127         mulsd   %xmm2, %xmm0
 128         ret
 129
 130 This doesn't need -ffast-math support at all.  This is particularly bad because
 131 the llvm-gcc frontend is canonicalizing the later into the former, but clang
 132 doesn't have this problem.
 133
 134 //===---------------------------------------------------------------------===//
 135
 136 These two functions should generate the same code on big-endian systems:
 137
 138 int g(int *j,int *l)  {  return memcmp(j,l,4);  }
 139 int h(int *j, int *l) {  return *j - *l; }
 140
 141 this could be done in SelectionDAGISel.cpp, along with other special cases,
 142 for 1,2,4,8 bytes.
 143
 144 //===---------------------------------------------------------------------===//
 145
 146 It would be nice to revert this patch:
 147 http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
 148
 149 And teach the dag combiner enough to simplify the code expanded before
 150 legalize.  It seems plausible that this knowledge would let it simplify other
 151 stuff too.
 152
 153 //===---------------------------------------------------------------------===//
 154
 155 For vector types, DataLayout.cpp::getTypeInfo() returns alignment that is equal
 156 to the type size. It works but can be overly conservative as the alignment of
 157 specific vector types are target dependent.
 158
 159 //===---------------------------------------------------------------------===//
 160
 161 We should produce an unaligned load from code like this:
 162
 163 v4sf example(float *P) {
 164   return (v4sf){P[0], P[1], P[2], P[3] };
 165 }
 166
 167 //===---------------------------------------------------------------------===//
 168
 169 Add support for conditional increments, and other related patterns.  Instead
 170 of:
 171
 172         movl 136(%esp), %eax
 173         cmpl $0, %eax
 174         je LBB16_2      #cond_next
 175 LBB16_1:        #cond_true
 176         incl _foo
 177 LBB16_2:        #cond_next
 178
 179 emit:
 180         movl    _foo, %eax
 181         cmpl    $1, %edi
 182         sbbl    $-1, %eax
 183         movl    %eax, _foo
 184
 185 //===---------------------------------------------------------------------===//
 186
 187 Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
 188
 189 Expand these to calls of sin/cos and stores:
 190       double sincos(double x, double *sin, double *cos);
 191       float sincosf(float x, float *sin, float *cos);
 192       long double sincosl(long double x, long double *sin, long double *cos);
 193
 194 Doing so could allow SROA of the destination pointers.  See also:
 195 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17687
 196
 197 This is now easily doable with MRVs.  We could even make an intrinsic for this
 198 if anyone cared enough about sincos.
 199
 200 //===---------------------------------------------------------------------===//
 201
 202 quantum_sigma_x in 462.libquantum contains the following loop:
 203
 204       for(i=0; i<reg->size; i++)
 205         {
 206           /* Flip the target bit of each basis state */
 207           reg->node[i].state ^= ((MAX_UNSIGNED) 1 << target);
 208         }
 209
 210 Where MAX_UNSIGNED/state is a 64-bit int.  On a 32-bit platform it would be just
 211 so cool to turn it into something like:
 212
 213    long long Res = ((MAX_UNSIGNED) 1 << target);
 214    if (target < 32) {
 215      for(i=0; i<reg->size; i++)
 216        reg->node[i].state ^= Res & 0xFFFFFFFFULL;
 217    } else {
 218      for(i=0; i<reg->size; i++)
 219        reg->node[i].state ^= Res & 0xFFFFFFFF00000000ULL
 220    }
 221
 222 ... which would only do one 32-bit XOR per loop iteration instead of two.
 223
 224 It would also be nice to recognize the reg->size doesn't alias reg->node[i], but
 225 this requires TBAA.
 226
 227 //===---------------------------------------------------------------------===//
 228
 229 This isn't recognized as bswap by instcombine (yes, it really is bswap):
 230
 231 unsigned long reverse(unsigned v) {
 232     unsigned t;
 233     t = v ^ ((v << 16) | (v >> 16));
 234     t &= ~0xff0000;
 235     v = (v << 24) | (v >> 8);
 236     return v ^ (t >> 8);
 237 }
 238
 239 //===---------------------------------------------------------------------===//
 240
 241 [LOOP DELETION]
 242
 243 We don't delete this output free loop, because trip count analysis doesn't
 244 realize that it is finite (if it were infinite, it would be undefined).  Not
 245 having this blocks Loop Idiom from matching strlen and friends.
 246
 247 void foo(char *C) {
 248   int x = 0;
 249   while (*C)
 250     ++x,++C;
 251 }
 252
 253 //===---------------------------------------------------------------------===//
 254
 255 [LOOP RECOGNITION]
 256
 257 These idioms should be recognized as popcount (see PR1488):
 258
 259 unsigned countbits_slow(unsigned v) {
 260   unsigned c;
 261   for (c = 0; v; v >>= 1)
 262     c += v & 1;
 263   return c;
 264 }
 265
 266 unsigned int popcount(unsigned int input) {
 267   unsigned int count = 0;
 268   for (unsigned int i =  0; i < 4 * 8; i++)
 269     count += (input >> i) & i;
 270   return count;
 271 }
 272
 273 This should be recognized as CLZ:  rdar://8459039
 274
 275 unsigned clz_a(unsigned a) {
 276   int i;
 277   for (i=0;i<32;i++)
 278     if (a & (1<<(31-i)))
 279       return i;
 280   return 32;
 281 }
 282
 283 This sort of thing should be added to the loop idiom pass.
 284
 285 //===---------------------------------------------------------------------===//
 286
 287 These should turn into single 16-bit (unaligned?) loads on little/big endian
 288 processors.
 289
 290 unsigned short read_16_le(const unsigned char *adr) {
 291   return adr[0] | (adr[1] << 8);
 292 }
 293 unsigned short read_16_be(const unsigned char *adr) {
 294   return (adr[0] << 8) | adr[1];
 295 }
 296
 297 //===---------------------------------------------------------------------===//
 298
 299 -instcombine should handle this transform:
 300    icmp pred (sdiv X / C1 ), C2
 301 when X, C1, and C2 are unsigned.  Similarly for udiv and signed operands.
 302
 303 Currently InstCombine avoids this transform but will do it when the signs of
 304 the operands and the sign of the divide match. See the FIXME in
 305 InstructionCombining.cpp in the visitSetCondInst method after the switch case
 306 for Instruction::UDiv (around line 4447) for more details.
 307
 308 The SingleSource/Benchmarks/Shootout-C++/hash and hash2 tests have examples of
 309 this construct.
 310
 311 //===---------------------------------------------------------------------===//
 312
 313 [LOOP OPTIMIZATION]
 314
 315 SingleSource/Benchmarks/Misc/dt.c shows several interesting optimization
 316 opportunities in its double_array_divs_variable function: it needs loop
 317 interchange, memory promotion (which LICM already does), vectorization and
 318 variable trip count loop unrolling (since it has a constant trip count). ICC
 319 apparently produces this very nice code with -ffast-math:
 320
 321 ..B1.70:                        # Preds ..B1.70 ..B1.69
 322        mulpd     %xmm0, %xmm1                                  #108.2
 323        mulpd     %xmm0, %xmm1                                  #108.2
 324        mulpd     %xmm0, %xmm1                                  #108.2
 325        mulpd     %xmm0, %xmm1                                  #108.2
 326        addl      $8, %edx                                      #
 327        cmpl      $131072, %edx                                 #108.2
 328        jb        ..B1.70       # Prob 99%                      #108.2
 329
 330 It would be better to count down to zero, but this is a lot better than what we
 331 do.
 332
 333 //===---------------------------------------------------------------------===//
 334
 335 Consider:
 336
 337 typedef unsigned U32;
 338 typedef unsigned long long U64;
 339 int test (U32 *inst, U64 *regs) {
 340     U64 effective_addr2;
 341     U32 temp = *inst;
 342     int r1 = (temp >> 20) & 0xf;
 343     int b2 = (temp >> 16) & 0xf;
 344     effective_addr2 = temp & 0xfff;
 345     if (b2) effective_addr2 += regs[b2];
 346     b2 = (temp >> 12) & 0xf;
 347     if (b2) effective_addr2 += regs[b2];
 348     effective_addr2 &= regs[4];
 349      if ((effective_addr2 & 3) == 0)
 350         return 1;
 351     return 0;
 352 }
 353
 354 Note that only the low 2 bits of effective_addr2 are used.  On 32-bit systems,
 355 we don't eliminate the computation of the top half of effective_addr2 because
 356 we don't have whole-function selection dags.  On x86, this means we use one
 357 extra register for the function when effective_addr2 is declared as U64 than
 358 when it is declared U32.
 359
 360 PHI Slicing could be extended to do this.
 361
 362 //===---------------------------------------------------------------------===//
 363
 364 Tail call elim should be more aggressive, checking to see if the call is
 365 followed by an uncond branch to an exit block.
 366
 367 ; This testcase is due to tail-duplication not wanting to copy the return
 368 ; instruction into the terminating blocks because there was other code
 369 ; optimized out of the function after the taildup happened.
 370 ; RUN: llvm-as < %s | opt -tailcallelim | llvm-dis | not grep call
 371
 372 define i32 @t4(i32 %a) {
 373 entry:
 374         %tmp.1 = and i32 %a, 1          ; <i32> [#uses=1]
 375         %tmp.2 = icmp ne i32 %tmp.1, 0          ; <i1> [#uses=1]
 376         br i1 %tmp.2, label %then.0, label %else.0
 377
 378 then.0:         ; preds = %entry
 379         %tmp.5 = add i32 %a, -1         ; <i32> [#uses=1]
 380         %tmp.3 = call i32 @t4( i32 %tmp.5 )             ; <i32> [#uses=1]
 381         br label %return
 382
 383 else.0:         ; preds = %entry
 384         %tmp.7 = icmp ne i32 %a, 0              ; <i1> [#uses=1]
 385         br i1 %tmp.7, label %then.1, label %return
 386
 387 then.1:         ; preds = %else.0
 388         %tmp.11 = add i32 %a, -2                ; <i32> [#uses=1]
 389         %tmp.9 = call i32 @t4( i32 %tmp.11 )            ; <i32> [#uses=1]
 390         br label %return
 391
 392 return:         ; preds = %then.1, %else.0, %then.0
 393         %result.0 = phi i32 [ 0, %else.0 ], [ %tmp.3, %then.0 ],
 394                             [ %tmp.9, %then.1 ]
 395         ret i32 %result.0
 396 }
 397
 398 //===---------------------------------------------------------------------===//
 399
 400 Tail recursion elimination should handle:
 401
 402 int pow2m1(int n) {
 403  if (n == 0)
 404    return 0;
 405  return 2 * pow2m1 (n - 1) + 1;
 406 }
 407
 408 Also, multiplies can be turned into SHL's, so they should be handled as if
 409 they were associative.  "return foo() << 1" can be tail recursion eliminated.
 410
 411 //===---------------------------------------------------------------------===//
 412
 413 Argument promotion should promote arguments for recursive functions, like
 414 this:
 415
 416 ; RUN: llvm-as < %s | opt -argpromotion | llvm-dis | grep x.val
 417
 418 define internal i32 @foo(i32* %x) {
 419 entry:
 420         %tmp = load i32* %x             ; <i32> [#uses=0]
 421         %tmp.foo = call i32 @foo( i32* %x )             ; <i32> [#uses=1]
 422         ret i32 %tmp.foo
 423 }
 424
 425 define i32 @bar(i32* %x) {
 426 entry:
 427         %tmp3 = call i32 @foo( i32* %x )                ; <i32> [#uses=1]
 428         ret i32 %tmp3
 429 }
 430
 431 //===---------------------------------------------------------------------===//
 432
 433 We should investigate an instruction sinking pass.  Consider this silly
 434 example in pic mode:
 435
 436 #include <assert.h>
 437 void foo(int x) {
 438   assert(x);
 439   //...
 440 }
 441
 442 we compile this to:
 443 _foo:
 444         subl    $28, %esp
 445         call    "L1$pb"
 446 "L1$pb":
 447         popl    %eax
 448         cmpl    $0, 32(%esp)
 449         je      LBB1_2  # cond_true
 450 LBB1_1: # return
 451         # ...
 452         addl    $28, %esp
 453         ret
 454 LBB1_2: # cond_true
 455 ...
 456
 457 The PIC base computation (call+popl) is only used on one path through the
 458 code, but is currently always computed in the entry block.  It would be
 459 better to sink the picbase computation down into the block for the
 460 assertion, as it is the only one that uses it.  This happens for a lot of
 461 code with early outs.
 462
 463 Another example is loads of arguments, which are usually emitted into the
 464 entry block on targets like x86.  If not used in all paths through a
 465 function, they should be sunk into the ones that do.
 466
 467 In this case, whole-function-isel would also handle this.
 468
 469 //===---------------------------------------------------------------------===//
 470
 471 Investigate lowering of sparse switch statements into perfect hash tables:
 472 http://burtleburtle.net/bob/hash/perfect.html
 473
 474 //===---------------------------------------------------------------------===//
 475
 476 We should turn things like "load+fabs+store" and "load+fneg+store" into the
 477 corresponding integer operations.  On a yonah, this loop:
 478
 479 double a[256];
 480 void foo() {
 481   int i, b;
 482   for (b = 0; b < 10000000; b++)
 483   for (i = 0; i < 256; i++)
 484     a[i] = -a[i];
 485 }
 486
 487 is twice as slow as this loop:
 488
 489 long long a[256];
 490 void foo() {
 491   int i, b;
 492   for (b = 0; b < 10000000; b++)
 493   for (i = 0; i < 256; i++)
 494     a[i] ^= (1ULL << 63);
 495 }
 496
 497 and I suspect other processors are similar.  On X86 in particular this is a
 498 big win because doing this with integers allows the use of read/modify/write
 499 instructions.
 500
 501 //===---------------------------------------------------------------------===//
 502
 503 DAG Combiner should try to combine small loads into larger loads when
 504 profitable.  For example, we compile this C++ example:
 505
 506 struct THotKey { short Key; bool Control; bool Shift; bool Alt; };
 507 extern THotKey m_HotKey;
 508 THotKey GetHotKey () { return m_HotKey; }
 509
 510 into (-m64 -O3 -fno-exceptions -static -fomit-frame-pointer):
 511
 512 __Z9GetHotKeyv:                         ## @_Z9GetHotKeyv
 513         movq    _m_HotKey@GOTPCREL(%rip), %rax
 514         movzwl  (%rax), %ecx
 515         movzbl  2(%rax), %edx
 516         shlq    $16, %rdx
 517         orq     %rcx, %rdx
 518         movzbl  3(%rax), %ecx
 519         shlq    $24, %rcx
 520         orq     %rdx, %rcx
 521         movzbl  4(%rax), %eax
 522         shlq    $32, %rax
 523         orq     %rcx, %rax
 524         ret
 525
 526 //===---------------------------------------------------------------------===//
 527
 528 We should add an FRINT node to the DAG to model targets that have legal
 529 implementations of ceil/floor/rint.
 530
 531 //===---------------------------------------------------------------------===//
 532
 533 Consider:
 534
 535 int test() {
 536   long long input[8] = {1,0,1,0,1,0,1,0};
 537   foo(input);
 538 }
 539
 540 Clang compiles this into:
 541
 542   call void @llvm.memset.p0i8.i64(i8* %tmp, i8 0, i64 64, i32 16, i1 false)
 543   %0 = getelementptr [8 x i64]* %input, i64 0, i64 0
 544   store i64 1, i64* %0, align 16
 545   %1 = getelementptr [8 x i64]* %input, i64 0, i64 2
 546   store i64 1, i64* %1, align 16
 547   %2 = getelementptr [8 x i64]* %input, i64 0, i64 4
 548   store i64 1, i64* %2, align 16
 549   %3 = getelementptr [8 x i64]* %input, i64 0, i64 6
 550   store i64 1, i64* %3, align 16
 551
 552 Which gets codegen'd into:
 553
 554         pxor    %xmm0, %xmm0
 555         movaps  %xmm0, -16(%rbp)
 556         movaps  %xmm0, -32(%rbp)
 557         movaps  %xmm0, -48(%rbp)
 558         movaps  %xmm0, -64(%rbp)
 559         movq    $1, -64(%rbp)
 560         movq    $1, -48(%rbp)
 561         movq    $1, -32(%rbp)
 562         movq    $1, -16(%rbp)
 563
 564 It would be better to have 4 movq's of 0 instead of the movaps's.
 565
 566 //===---------------------------------------------------------------------===//
 567
 568 http://llvm.org/PR717:
 569
 570 The following code should compile into "ret int undef". Instead, LLVM
 571 produces "ret int 0":
 572
 573 int f() {
 574   int x = 4;
 575   int y;
 576   if (x == 3) y = 0;
 577   return y;
 578 }
 579
 580 //===---------------------------------------------------------------------===//
 581
 582 The loop unroller should partially unroll loops (instead of peeling them)
 583 when code growth isn't too bad and when an unroll count allows simplification
 584 of some code within the loop.  One trivial example is:
 585
 586 #include <stdio.h>
 587 int main() {
 588     int nRet = 17;
 589     int nLoop;
 590     for ( nLoop = 0; nLoop < 1000; nLoop++ ) {
 591         if ( nLoop & 1 )
 592             nRet += 2;
 593         else
 594             nRet -= 1;
 595     }
 596     return nRet;
 597 }
 598
 599 Unrolling by 2 would eliminate the '&1' in both copies, leading to a net
 600 reduction in code size.  The resultant code would then also be suitable for
 601 exit value computation.
 602
 603 //===---------------------------------------------------------------------===//
 604
 605 We miss a bunch of rotate opportunities on various targets, including ppc, x86,
 606 etc.  On X86, we miss a bunch of 'rotate by variable' cases because the rotate
 607 matching code in dag combine doesn't look through truncates aggressively
 608 enough.  Here are some testcases reduces from GCC PR17886:
 609
 610 unsigned long long f5(unsigned long long x, unsigned long long y) {
 611   return (x << 8) | ((y >> 48) & 0xffull);
 612 }
 613 unsigned long long f6(unsigned long long x, unsigned long long y, int z) {
 614   switch(z) {
 615   case 1:
 616     return (x << 8) | ((y >> 48) & 0xffull);
 617   case 2:
 618     return (x << 16) | ((y >> 40) & 0xffffull);
 619   case 3:
 620     return (x << 24) | ((y >> 32) & 0xffffffull);
 621   case 4:
 622     return (x << 32) | ((y >> 24) & 0xffffffffull);
 623   default:
 624     return (x << 40) | ((y >> 16) & 0xffffffffffull);
 625   }
 626 }
 627
 628 //===---------------------------------------------------------------------===//
 629
 630 This (and similar related idioms):
 631
 632 unsigned int foo(unsigned char i) {
 633   return i | (i<<8) | (i<<16) | (i<<24);
 634 }
 635
 636 compiles into:
 637
 638 define i32 @foo(i8 zeroext %i) nounwind readnone ssp noredzone {
 639 entry:
 640   %conv = zext i8 %i to i32
 641   %shl = shl i32 %conv, 8
 642   %shl5 = shl i32 %conv, 16
 643   %shl9 = shl i32 %conv, 24
 644   %or = or i32 %shl9, %conv
 645   %or6 = or i32 %or, %shl5
 646   %or10 = or i32 %or6, %shl
 647   ret i32 %or10
 648 }
 649
 650 it would be better as:
 651
 652 unsigned int bar(unsigned char i) {
 653   unsigned int j=i | (i << 8);
 654   return j | (j<<16);
 655 }
 656
 657 aka:
 658
 659 define i32 @bar(i8 zeroext %i) nounwind readnone ssp noredzone {
 660 entry:
 661   %conv = zext i8 %i to i32
 662   %shl = shl i32 %conv, 8
 663   %or = or i32 %shl, %conv
 664   %shl5 = shl i32 %or, 16
 665   %or6 = or i32 %shl5, %or
 666   ret i32 %or6
 667 }
 668
 669 or even i*0x01010101, depending on the speed of the multiplier.  The best way to
 670 handle this is to canonicalize it to a multiply in IR and have codegen handle
 671 lowering multiplies to shifts on cpus where shifts are faster.
 672
 673 //===---------------------------------------------------------------------===//
 674
 675 We do a number of simplifications in simplify libcalls to strength reduce
 676 standard library functions, but we don't currently merge them together.  For
 677 example, it is useful to merge memcpy(a,b,strlen(b)) -> strcpy.  This can only
 678 be done safely if "b" isn't modified between the strlen and memcpy of course.
 679
 680 //===---------------------------------------------------------------------===//
 681
 682 We compile this program: (from GCC PR11680)
 683 http://gcc.gnu.org/bugzilla/attachment.cgi?id=4487
 684
 685 Into code that runs the same speed in fast/slow modes, but both modes run 2x
 686 slower than when compile with GCC (either 4.0 or 4.2):
 687
 688 $ llvm-g++ perf.cpp -O3 -fno-exceptions
 689 $ time ./a.out fast
 690 1.821u 0.003s 0:01.82 100.0%    0+0k 0+0io 0pf+0w
 691
 692 $ g++ perf.cpp -O3 -fno-exceptions
 693 $ time ./a.out fast
 694 0.821u 0.001s 0:00.82 100.0%    0+0k 0+0io 0pf+0w
 695
 696 It looks like we are making the same inlining decisions, so this may be raw
 697 codegen badness or something else (haven't investigated).
 698
 699 //===---------------------------------------------------------------------===//
 700
 701 Divisibility by constant can be simplified (according to GCC PR12849) from
 702 being a mulhi to being a mul lo (cheaper).  Testcase:
 703
 704 void bar(unsigned n) {
 705   if (n % 3 == 0)
 706     true();
 707 }
 708
 709 This is equivalent to the following, where 2863311531 is the multiplicative
 710 inverse of 3, and 1431655766 is ((2^32)-1)/3+1:
 711 void bar(unsigned n) {
 712   if (n * 2863311531U < 1431655766U)
 713     true();
 714 }
 715
 716 The same transformation can work with an even modulo with the addition of a
 717 rotate: rotate the result of the multiply to the right by the number of bits
 718 which need to be zero for the condition to be true, and shrink the compare RHS
 719 by the same amount.  Unless the target supports rotates, though, that
 720 transformation probably isn't worthwhile.
 721
 722 The transformation can also easily be made to work with non-zero equality
 723 comparisons: just transform, for example, "n % 3 == 1" to "(n-1) % 3 == 0".
 724
 725 //===---------------------------------------------------------------------===//
 726
 727 Better mod/ref analysis for scanf would allow us to eliminate the vtable and a
 728 bunch of other stuff from this example (see PR1604):
 729
 730 #include <cstdio>
 731 struct test {
 732     int val;
 733     virtual ~test() {}
 734 };
 735
 736 int main() {
 737     test t;
 738     std::scanf("%d", &t.val);
 739     std::printf("%d\n", t.val);
 740 }
 741
 742 //===---------------------------------------------------------------------===//
 743
 744 These functions perform the same computation, but produce different assembly.
 745
 746 define i8 @select(i8 %x) readnone nounwind {
 747   %A = icmp ult i8 %x, 250
 748   %B = select i1 %A, i8 0, i8 1
 749   ret i8 %B
 750 }
 751
 752 define i8 @addshr(i8 %x) readnone nounwind {
 753   %A = zext i8 %x to i9
 754   %B = add i9 %A, 6       ;; 256 - 250 == 6
 755   %C = lshr i9 %B, 8
 756   %D = trunc i9 %C to i8
 757   ret i8 %D
 758 }
 759
 760 //===---------------------------------------------------------------------===//
 761
 762 From gcc bug 24696:
 763 int
 764 f (unsigned long a, unsigned long b, unsigned long c)
 765 {
 766   return ((a & (c - 1)) != 0) || ((b & (c - 1)) != 0);
 767 }
 768 int
 769 f (unsigned long a, unsigned long b, unsigned long c)
 770 {
 771   return ((a & (c - 1)) != 0) | ((b & (c - 1)) != 0);
 772 }
 773 Both should combine to ((a|b) & (c-1)) != 0.  Currently not optimized with
 774 "clang -emit-llvm-bc | opt -std-compile-opts".
 775
 776 //===---------------------------------------------------------------------===//
 777
 778 From GCC Bug 20192:
 779 #define PMD_MASK    (~((1UL << 23) - 1))
 780 void clear_pmd_range(unsigned long start, unsigned long end)
 781 {
 782    if (!(start & ~PMD_MASK) && !(end & ~PMD_MASK))
 783        f();
 784 }
 785 The expression should optimize to something like
 786 "!((start|end)&~PMD_MASK). Currently not optimized with "clang
 787 -emit-llvm-bc | opt -std-compile-opts".
 788
 789 //===---------------------------------------------------------------------===//
 790
 791 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return
 792 i;}
 793 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 794 These should combine to the same thing.  Currently, the first function
 795 produces better code on X86.
 796
 797 //===---------------------------------------------------------------------===//
 798
 799 From GCC Bug 15784:
 800 #define abs(x) x>0?x:-x
 801 int f(int x, int y)
 802 {
 803  return (abs(x)) >= 0;
 804 }
 805 This should optimize to x == INT_MIN. (With -fwrapv.)  Currently not
 806 optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
 807
 808 //===---------------------------------------------------------------------===//
 809
 810 From GCC Bug 14753:
 811 void
 812 rotate_cst (unsigned int a)
 813 {
 814  a = (a << 10) | (a >> 22);
 815  if (a == 123)
 816    bar ();
 817 }
 818 void
 819 minus_cst (unsigned int a)
 820 {
 821  unsigned int tem;
 822
 823  tem = 20 - a;
 824  if (tem == 5)
 825    bar ();
 826 }
 827 void
 828 mask_gt (unsigned int a)
 829 {
 830  /* This is equivalent to a > 15.  */
 831  if ((a & ~7) > 8)
 832    bar ();
 833 }
 834 void
 835 rshift_gt (unsigned int a)
 836 {
 837  /* This is equivalent to a > 23.  */
 838  if ((a >> 2) > 5)
 839    bar ();
 840 }
 841
 842 All should simplify to a single comparison.  All of these are
 843 currently not optimized with "clang -emit-llvm-bc | opt
 844 -std-compile-opts".
 845
 846 //===---------------------------------------------------------------------===//
 847
 848 From GCC Bug 32605:
 849 int c(int* x) {return (char*)x+2 == (char*)x;}
 850 Should combine to 0.  Currently not optimized with "clang
 851 -emit-llvm-bc | opt -std-compile-opts" (although llc can optimize it).
 852
 853 //===---------------------------------------------------------------------===//
 854
 855 int a(unsigned b) {return ((b << 31) | (b << 30)) >> 31;}
 856 Should be combined to  "((b >> 1) | b) & 1".  Currently not optimized
 857 with "clang -emit-llvm-bc | opt -std-compile-opts".
 858
 859 //===---------------------------------------------------------------------===//
 860
 861 unsigned a(unsigned x, unsigned y) { return x | (y & 1) | (y & 2);}
 862 Should combine to "x | (y & 3)".  Currently not optimized with "clang
 863 -emit-llvm-bc | opt -std-compile-opts".
 864
 865 //===---------------------------------------------------------------------===//
 866
 867 int a(int a, int b, int c) {return (~a & c) | ((c|a) & b);}
 868 Should fold to "(~a & c) | (a & b)".  Currently not optimized with
 869 "clang -emit-llvm-bc | opt -std-compile-opts".
 870
 871 //===---------------------------------------------------------------------===//
 872
 873 int a(int a,int b) {return (~(a|b))|a;}
 874 Should fold to "a|~b".  Currently not optimized with "clang
 875 -emit-llvm-bc | opt -std-compile-opts".
 876
 877 //===---------------------------------------------------------------------===//
 878
 879 int a(int a, int b) {return (a&&b) || (a&&!b);}
 880 Should fold to "a".  Currently not optimized with "clang -emit-llvm-bc
 881 | opt -std-compile-opts".
 882
 883 //===---------------------------------------------------------------------===//
 884
 885 int a(int a, int b, int c) {return (a&&b) || (!a&&c);}
 886 Should fold to "a ? b : c", or at least something sane.  Currently not
 887 optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
 888
 889 //===---------------------------------------------------------------------===//
 890
 891 int a(int a, int b, int c) {return (a&&b) || (a&&c) || (a&&b&&c);}
 892 Should fold to a && (b || c).  Currently not optimized with "clang
 893 -emit-llvm-bc | opt -std-compile-opts".
 894
 895 //===---------------------------------------------------------------------===//
 896
 897 int a(int x) {return x | ((x & 8) ^ 8);}
 898 Should combine to x | 8.  Currently not optimized with "clang
 899 -emit-llvm-bc | opt -std-compile-opts".
 900
 901 //===---------------------------------------------------------------------===//
 902
 903 int a(int x) {return x ^ ((x & 8) ^ 8);}
 904 Should also combine to x | 8.  Currently not optimized with "clang
 905 -emit-llvm-bc | opt -std-compile-opts".
 906
 907 //===---------------------------------------------------------------------===//
 908
 909 int a(int x) {return ((x | -9) ^ 8) & x;}
 910 Should combine to x & -9.  Currently not optimized with "clang
 911 -emit-llvm-bc | opt -std-compile-opts".
 912
 913 //===---------------------------------------------------------------------===//
 914
 915 unsigned a(unsigned a) {return a * 0x11111111 >> 28 & 1;}
 916 Should combine to "a * 0x88888888 >> 31".  Currently not optimized
 917 with "clang -emit-llvm-bc | opt -std-compile-opts".
 918
 919 //===---------------------------------------------------------------------===//
 920
 921 unsigned a(char* x) {if ((*x & 32) == 0) return b();}
 922 There's an unnecessary zext in the generated code with "clang
 923 -emit-llvm-bc | opt -std-compile-opts".
 924
 925 //===---------------------------------------------------------------------===//
 926
 927 unsigned a(unsigned long long x) {return 40 * (x >> 1);}
 928 Should combine to "20 * (((unsigned)x) & -2)".  Currently not
 929 optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
 930
 931 //===---------------------------------------------------------------------===//
 932
 933 int f(int i, int j) { return i < j + 1; }
 934 int g(int i, int j) { return j > i - 1; }
 935 Should combine to "i <= j" (the add/sub has nsw).  Currently not
 936 optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
 937
 938 //===---------------------------------------------------------------------===//
 939
 940 unsigned f(unsigned x) { return ((x & 7) + 1) & 15; }
 941 The & 15 part should be optimized away, it doesn't change the result. Currently
 942 not optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
 943
 944 //===---------------------------------------------------------------------===//
 945
 946 This was noticed in the entryblock for grokdeclarator in 403.gcc:
 947
 948         %tmp = icmp eq i32 %decl_context, 4
 949         %decl_context_addr.0 = select i1 %tmp, i32 3, i32 %decl_context
 950         %tmp1 = icmp eq i32 %decl_context_addr.0, 1
 951         %decl_context_addr.1 = select i1 %tmp1, i32 0, i32 %decl_context_addr.0
 952
 953 tmp1 should be simplified to something like:
 954   (!tmp || decl_context == 1)
 955
 956 This allows recursive simplifications, tmp1 is used all over the place in
 957 the function, e.g. by:
 958
 959         %tmp23 = icmp eq i32 %decl_context_addr.1, 0            ; <i1> [#uses=1]
 960         %tmp24 = xor i1 %tmp1, true             ; <i1> [#uses=1]
 961         %or.cond8 = and i1 %tmp23, %tmp24               ; <i1> [#uses=1]
 962
 963 later.
 964
 965 //===---------------------------------------------------------------------===//
 966
 967 [STORE SINKING]
 968
 969 Store sinking: This code:
 970
 971 void f (int n, int *cond, int *res) {
 972     int i;
 973     *res = 0;
 974     for (i = 0; i < n; i++)
 975         if (*cond)
 976             *res ^= 234; /* (*) */
 977 }
 978
 979 On this function GVN hoists the fully redundant value of *res, but nothing
 980 moves the store out.  This gives us this code:
 981
 982 bb:             ; preds = %bb2, %entry
 983         %.rle = phi i32 [ 0, %entry ], [ %.rle6, %bb2 ]
 984         %i.05 = phi i32 [ 0, %entry ], [ %indvar.next, %bb2 ]
 985         %1 = load i32* %cond, align 4
 986         %2 = icmp eq i32 %1, 0
 987         br i1 %2, label %bb2, label %bb1
 988
 989 bb1:            ; preds = %bb
 990         %3 = xor i32 %.rle, 234
 991         store i32 %3, i32* %res, align 4
 992         br label %bb2
 993
 994 bb2:            ; preds = %bb, %bb1
 995         %.rle6 = phi i32 [ %3, %bb1 ], [ %.rle, %bb ]
 996         %indvar.next = add i32 %i.05, 1
 997         %exitcond = icmp eq i32 %indvar.next, %n
 998         br i1 %exitcond, label %return, label %bb
 999
1000 DSE should sink partially dead stores to get the store out of the loop.
1001
1002 Here's another partial dead case:
1003 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12395
1004
1005 //===---------------------------------------------------------------------===//
1006
1007 Scalar PRE hoists the mul in the common block up to the else:
1008
1009 int test (int a, int b, int c, int g) {
1010   int d, e;
1011   if (a)
1012     d = b * c;
1013   else
1014     d = b - c;
1015   e = b * c + g;
1016   return d + e;
1017 }
1018
1019 It would be better to do the mul once to reduce codesize above the if.
1020 This is GCC PR38204.
1021
1022
1023 //===---------------------------------------------------------------------===//
1024 This simple function from 179.art:
1025
1026 int winner, numf2s;
1027 struct { double y; int   reset; } *Y;
1028
1029 void find_match() {
1030    int i;
1031    winner = 0;
1032    for (i=0;i<numf2s;i++)
1033        if (Y[i].y > Y[winner].y)
1034               winner =i;
1035 }
1036
1037 Compiles into (with clang TBAA):
1038
1039 for.body:                                         ; preds = %for.inc, %bb.nph
1040   %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.inc ]
1041   %i.01718 = phi i32 [ 0, %bb.nph ], [ %i.01719, %for.inc ]
1042   %tmp4 = getelementptr inbounds %struct.anon* %tmp3, i64 %indvar, i32 0
1043   %tmp5 = load double* %tmp4, align 8, !tbaa !4
1044   %idxprom7 = sext i32 %i.01718 to i64
1045   %tmp10 = getelementptr inbounds %struct.anon* %tmp3, i64 %idxprom7, i32 0
1046   %tmp11 = load double* %tmp10, align 8, !tbaa !4
1047   %cmp12 = fcmp ogt double %tmp5, %tmp11
1048   br i1 %cmp12, label %if.then, label %for.inc
1049
1050 if.then:                                          ; preds = %for.body
1051   %i.017 = trunc i64 %indvar to i32
1052   br label %for.inc
1053
1054 for.inc:                                          ; preds = %for.body, %if.then
1055   %i.01719 = phi i32 [ %i.01718, %for.body ], [ %i.017, %if.then ]
1056   %indvar.next = add i64 %indvar, 1
1057   %exitcond = icmp eq i64 %indvar.next, %tmp22
1058   br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
1059
1060
1061 It is good that we hoisted the reloads of numf2's, and Y out of the loop and
1062 sunk the store to winner out.
1063
1064 However, this is awful on several levels: the conditional truncate in the loop
1065 (-indvars at fault? why can't we completely promote the IV to i64?).
1066
1067 Beyond that, we have a partially redundant load in the loop: if "winner" (aka
1068 %i.01718) isn't updated, we reload Y[winner].y the next time through the loop.
1069 Similarly, the addressing that feeds it (including the sext) is redundant. In
1070 the end we get this generated assembly:
1071
1072 LBB0_2:                                 ## %for.body
1073                                         ## =>This Inner Loop Header: Depth=1
1074         movsd   (%rdi), %xmm0
1075         movslq  %edx, %r8
1076         shlq    $4, %r8
1077         ucomisd (%rcx,%r8), %xmm0
1078         jbe     LBB0_4
1079         movl    %esi, %edx
1080 LBB0_4:                                 ## %for.inc
1081         addq    $16, %rdi
1082         incq    %rsi
1083         cmpq    %rsi, %rax
1084         jne     LBB0_2
1085
1086 All things considered this isn't too bad, but we shouldn't need the movslq or
1087 the shlq instruction, or the load folded into ucomisd every time through the
1088 loop.
1089
1090 On an x86-specific topic, if the loop can't be restructure, the movl should be a
1091 cmov.
1092
1093 //===---------------------------------------------------------------------===//
1094
1095 [STORE SINKING]
1096
1097 GCC PR37810 is an interesting case where we should sink load/store reload
1098 into the if block and outside the loop, so we don't reload/store it on the
1099 non-call path.
1100
1101 for () {
1102   *P += 1;
1103   if ()
1104     call();
1105   else
1106     ...
1107 ->
1108 tmp = *P
1109 for () {
1110   tmp += 1;
1111   if () {
1112     *P = tmp;
1113     call();
1114     tmp = *P;
1115   } else ...
1116 }
1117 *P = tmp;
1118
1119 We now hoist the reload after the call (Transforms/GVN/lpre-call-wrap.ll), but
1120 we don't sink the store.  We need partially dead store sinking.
1121
1122 //===---------------------------------------------------------------------===//
1123
1124 [LOAD PRE CRIT EDGE SPLITTING]
1125
1126 GCC PR37166: Sinking of loads prevents SROA'ing the "g" struct on the stack
1127 leading to excess stack traffic. This could be handled by GVN with some crazy
1128 symbolic phi translation.  The code we get looks like (g is on the stack):
1129
1130 bb2:            ; preds = %bb1
1131 ..
1132         %9 = getelementptr %struct.f* %g, i32 0, i32 0
1133         store i32 %8, i32* %9, align  bel %bb3
1134
1135 bb3:            ; preds = %bb1, %bb2, %bb
1136         %c_addr.0 = phi %struct.f* [ %g, %bb2 ], [ %c, %bb ], [ %c, %bb1 ]
1137         %b_addr.0 = phi %struct.f* [ %b, %bb2 ], [ %g, %bb ], [ %b, %bb1 ]
1138         %10 = getelementptr %struct.f* %c_addr.0, i32 0, i32 0
1139         %11 = load i32* %10, align 4
1140
1141 %11 is partially redundant, an in BB2 it should have the value %8.
1142
1143 GCC PR33344 and PR35287 are similar cases.
1144
1145
1146 //===---------------------------------------------------------------------===//
1147
1148 [LOAD PRE]
1149
1150 There are many load PRE testcases in testsuite/gcc.dg/tree-ssa/loadpre* in the
1151 GCC testsuite, ones we don't get yet are (checked through loadpre25):
1152
1153 [CRIT EDGE BREAKING]
1154 loadpre3.c predcom-4.c
1155
1156 [PRE OF READONLY CALL]
1157 loadpre5.c
1158
1159 [TURN SELECT INTO BRANCH]
1160 loadpre14.c loadpre15.c
1161
1162 actually a conditional increment: loadpre18.c loadpre19.c
1163
1164 //===---------------------------------------------------------------------===//
1165
1166 [LOAD PRE / STORE SINKING / SPEC HACK]
1167
1168 This is a chunk of code from 456.hmmer:
1169
1170 int f(int M, int *mc, int *mpp, int *tpmm, int *ip, int *tpim, int *dpp,
1171      int *tpdm, int xmb, int *bp, int *ms) {
1172  int k, sc;
1173  for (k = 1; k <= M; k++) {
1174      mc[k] = mpp[k-1]   + tpmm[k-1];
1175      if ((sc = ip[k-1]  + tpim[k-1]) > mc[k])  mc[k] = sc;
1176      if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k])  mc[k] = sc;
1177      if ((sc = xmb  + bp[k])         > mc[k])  mc[k] = sc;
1178      mc[k] += ms[k];
1179    }
1180 }
1181
1182 It is very profitable for this benchmark to turn the conditional stores to mc[k]
1183 into a conditional move (select instr in IR) and allow the final store to do the
1184 store.  See GCC PR27313 for more details.  Note that this is valid to xform even
1185 with the new C++ memory model, since mc[k] is previously loaded and later
1186 stored.
1187
1188 //===---------------------------------------------------------------------===//
1189
1190 [SCALAR PRE]
1191 There are many PRE testcases in testsuite/gcc.dg/tree-ssa/ssa-pre-*.c in the
1192 GCC testsuite.
1193
1194 //===---------------------------------------------------------------------===//
1195
1196 There are some interesting cases in testsuite/gcc.dg/tree-ssa/pred-comm* in the
1197 GCC testsuite.  For example, we get the first example in predcom-1.c, but
1198 miss the second one:
1199
1200 unsigned fib[1000];
1201 unsigned avg[1000];
1202
1203 __attribute__ ((noinline))
1204 void count_averages(int n) {
1205   int i;
1206   for (i = 1; i < n; i++)
1207     avg[i] = (((unsigned long) fib[i - 1] + fib[i] + fib[i + 1]) / 3) & 0xffff;
1208 }
1209
1210 which compiles into two loads instead of one in the loop.
1211
1212 predcom-2.c is the same as predcom-1.c
1213
1214 predcom-3.c is very similar but needs loads feeding each other instead of
1215 store->load.
1216
1217
1218 //===---------------------------------------------------------------------===//
1219
1220 [ALIAS ANALYSIS]
1221
1222 Type based alias analysis:
1223 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14705
1224
1225 We should do better analysis of posix_memalign.  At the least it should
1226 no-capture its pointer argument, at best, we should know that the out-value
1227 result doesn't point to anything (like malloc).  One example of this is in
1228 SingleSource/Benchmarks/Misc/dt.c
1229
1230 //===---------------------------------------------------------------------===//
1231
1232 Interesting missed case because of control flow flattening (should be 2 loads):
1233 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26629
1234 With: llvm-gcc t2.c -S -o - -O0 -emit-llvm | llvm-as |
1235              opt -mem2reg -gvn -instcombine | llvm-dis
1236 we miss it because we need 1) CRIT EDGE 2) MULTIPLE DIFFERENT
1237 VALS PRODUCED BY ONE BLOCK OVER DIFFERENT PATHS
1238
1239 //===---------------------------------------------------------------------===//
1240
1241 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19633
1242 We could eliminate the branch condition here, loading from null is undefined:
1243
1244 struct S { int w, x, y, z; };
1245 struct T { int r; struct S s; };
1246 void bar (struct S, int);
1247 void foo (int a, struct T b)
1248 {
1249   struct S *c = 0;
1250   if (a)
1251     c = &b.s;
1252   bar (*c, a);
1253 }
1254
1255 //===---------------------------------------------------------------------===//
1256
1257 simplifylibcalls should do several optimizations for strspn/strcspn:
1258
1259 strcspn(x, "a") -> inlined loop for up to 3 letters (similarly for strspn):
1260
1261 size_t __strcspn_c3 (__const char *__s, int __reject1, int __reject2,
1262                      int __reject3) {
1263   register size_t __result = 0;
1264   while (__s[__result] != '\0' && __s[__result] != __reject1 &&
1265          __s[__result] != __reject2 && __s[__result] != __reject3)
1266     ++__result;
1267   return __result;
1268 }
1269
1270 This should turn into a switch on the character.  See PR3253 for some notes on
1271 codegen.
1272
1273 456.hmmer apparently uses strcspn and strspn a lot.  471.omnetpp uses strspn.
1274
1275 //===---------------------------------------------------------------------===//
1276
1277 simplifylibcalls should turn these snprintf idioms into memcpy (GCC PR47917)
1278
1279 char buf1[6], buf2[6], buf3[4], buf4[4];
1280 int i;
1281
1282 int foo (void) {
1283   int ret = snprintf (buf1, sizeof buf1, "abcde");
1284   ret += snprintf (buf2, sizeof buf2, "abcdef") * 16;
1285   ret += snprintf (buf3, sizeof buf3, "%s", i++ < 6 ? "abc" : "def") * 256;
1286   ret += snprintf (buf4, sizeof buf4, "%s", i++ > 10 ? "abcde" : "defgh")*4096;
1287   return ret;
1288 }
1289
1290 //===---------------------------------------------------------------------===//
1291
1292 "gas" uses this idiom:
1293   else if (strchr ("+-/*%|&^:[]()~", *intel_parser.op_string))
1294 ..
1295   else if (strchr ("<>", *intel_parser.op_string)
1296
1297 Those should be turned into a switch.
1298
1299 //===---------------------------------------------------------------------===//
1300
1301 252.eon contains this interesting code:
1302
1303         %3072 = getelementptr [100 x i8]* %tempString, i32 0, i32 0
1304         %3073 = call i8* @strcpy(i8* %3072, i8* %3071) nounwind
1305         %strlen = call i32 @strlen(i8* %3072)    ; uses = 1
1306         %endptr = getelementptr [100 x i8]* %tempString, i32 0, i32 %strlen
1307         call void @llvm.memcpy.i32(i8* %endptr,
1308           i8* getelementptr ([5 x i8]* @"\01LC42", i32 0, i32 0), i32 5, i32 1)
1309         %3074 = call i32 @strlen(i8* %endptr) nounwind readonly
1310
1311 This is interesting for a couple reasons.  First, in this:
1312
1313 The memcpy+strlen strlen can be replaced with:
1314
1315         %3074 = call i32 @strlen([5 x i8]* @"\01LC42") nounwind readonly
1316
1317 Because the destination was just copied into the specified memory buffer.  This,
1318 in turn, can be constant folded to "4".
1319
1320 In other code, it contains:
1321
1322         %endptr6978 = bitcast i8* %endptr69 to i32*
1323         store i32 7107374, i32* %endptr6978, align 1
1324         %3167 = call i32 @strlen(i8* %endptr69) nounwind readonly
1325
1326 Which could also be constant folded.  Whatever is producing this should probably
1327 be fixed to leave this as a memcpy from a string.
1328
1329 Further, eon also has an interesting partially redundant strlen call:
1330
1331 bb8:            ; preds = %_ZN18eonImageCalculatorC1Ev.exit
1332         %682 = getelementptr i8** %argv, i32 6          ; <i8**> [#uses=2]
1333         %683 = load i8** %682, align 4          ; <i8*> [#uses=4]
1334         %684 = load i8* %683, align 1           ; <i8> [#uses=1]
1335         %685 = icmp eq i8 %684, 0               ; <i1> [#uses=1]
1336         br i1 %685, label %bb10, label %bb9
1337
1338 bb9:            ; preds = %bb8
1339         %686 = call i32 @strlen(i8* %683) nounwind readonly
1340         %687 = icmp ugt i32 %686, 254           ; <i1> [#uses=1]
1341         br i1 %687, label %bb10, label %bb11
1342
1343 bb10:           ; preds = %bb9, %bb8
1344         %688 = call i32 @strlen(i8* %683) nounwind readonly
1345
1346 This could be eliminated by doing the strlen once in bb8, saving code size and
1347 improving perf on the bb8->9->10 path.
1348
1349 //===---------------------------------------------------------------------===//
1350
1351 I see an interesting fully redundant call to strlen left in 186.crafty:InputMove
1352 which looks like:
1353        %movetext11 = getelementptr [128 x i8]* %movetext, i32 0, i32 0
1354
1355
1356 bb62:           ; preds = %bb55, %bb53
1357         %promote.0 = phi i32 [ %169, %bb55 ], [ 0, %bb53 ]
1358         %171 = call i32 @strlen(i8* %movetext11) nounwind readonly align 1
1359         %172 = add i32 %171, -1         ; <i32> [#uses=1]
1360         %173 = getelementptr [128 x i8]* %movetext, i32 0, i32 %172
1361
1362 ...  no stores ...
1363        br i1 %or.cond, label %bb65, label %bb72
1364
1365 bb65:           ; preds = %bb62
1366         store i8 0, i8* %173, align 1
1367         br label %bb72
1368
1369 bb72:           ; preds = %bb65, %bb62
1370         %trank.1 = phi i32 [ %176, %bb65 ], [ -1, %bb62 ]
1371         %177 = call i32 @strlen(i8* %movetext11) nounwind readonly align 1
1372
1373 Note that on the bb62->bb72 path, that the %177 strlen call is partially
1374 redundant with the %171 call.  At worst, we could shove the %177 strlen call
1375 up into the bb65 block moving it out of the bb62->bb72 path.   However, note
1376 that bb65 stores to the string, zeroing out the last byte.  This means that on
1377 that path the value of %177 is actually just %171-1.  A sub is cheaper than a
1378 strlen!
1379
1380 This pattern repeats several times, basically doing:
1381
1382   A = strlen(P);
1383   P[A-1] = 0;
1384   B = strlen(P);
1385   where it is "obvious" that B = A-1.
1386
1387 //===---------------------------------------------------------------------===//
1388
1389 186.crafty has this interesting pattern with the "out.4543" variable:
1390
1391 call void @llvm.memcpy.i32(
1392         i8* getelementptr ([10 x i8]* @out.4543, i32 0, i32 0),
1393        i8* getelementptr ([7 x i8]* @"\01LC28700", i32 0, i32 0), i32 7, i32 1)
1394 %101 = call@printf(i8* ...   @out.4543, i32 0, i32 0)) nounwind
1395
1396 It is basically doing:
1397
1398   memcpy(globalarray, "string");
1399   printf(...,  globalarray);
1400
1401 Anyway, by knowing that printf just reads the memory and forward substituting
1402 the string directly into the printf, this eliminates reads from globalarray.
1403 Since this pattern occurs frequently in crafty (due to the "DisplayTime" and
1404 other similar functions) there are many stores to "out".  Once all the printfs
1405 stop using "out", all that is left is the memcpy's into it.  This should allow
1406 globalopt to remove the "stored only" global.
1407
1408 //===---------------------------------------------------------------------===//
1409
1410 This code:
1411
1412 define inreg i32 @foo(i8* inreg %p) nounwind {
1413   %tmp0 = load i8* %p
1414   %tmp1 = ashr i8 %tmp0, 5
1415   %tmp2 = sext i8 %tmp1 to i32
1416   ret i32 %tmp2
1417 }
1418
1419 could be dagcombine'd to a sign-extending load with a shift.
1420 For example, on x86 this currently gets this:
1421
1422         movb    (%eax), %al
1423         sarb    $5, %al
1424         movsbl  %al, %eax
1425
1426 while it could get this:
1427
1428         movsbl  (%eax), %eax
1429         sarl    $5, %eax
1430
1431 //===---------------------------------------------------------------------===//
1432
1433 GCC PR31029:
1434
1435 int test(int x) { return 1-x == x; }     // --> return false
1436 int test2(int x) { return 2-x == x; }    // --> return x == 1 ?
1437
1438 Always foldable for odd constants, what is the rule for even?
1439
1440 //===---------------------------------------------------------------------===//
1441
1442 PR 3381: GEP to field of size 0 inside a struct could be turned into GEP
1443 for next field in struct (which is at same address).
1444
1445 For example: store of float into { {{}}, float } could be turned into a store to
1446 the float directly.
1447
1448 //===---------------------------------------------------------------------===//
1449
1450 The arg promotion pass should make use of nocapture to make its alias analysis
1451 stuff much more precise.
1452
1453 //===---------------------------------------------------------------------===//
1454
1455 The following functions should be optimized to use a select instead of a
1456 branch (from gcc PR40072):
1457
1458 char char_int(int m) {if(m>7) return 0; return m;}
1459 int int_char(char m) {if(m>7) return 0; return m;}
1460
1461 //===---------------------------------------------------------------------===//
1462
1463 int func(int a, int b) { if (a & 0x80) b |= 0x80; else b &= ~0x80; return b; }
1464
1465 Generates this:
1466
1467 define i32 @func(i32 %a, i32 %b) nounwind readnone ssp {
1468 entry:
1469   %0 = and i32 %a, 128                            ; <i32> [#uses=1]
1470   %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
1471   %2 = or i32 %b, 128                             ; <i32> [#uses=1]
1472   %3 = and i32 %b, -129                           ; <i32> [#uses=1]
1473   %b_addr.0 = select i1 %1, i32 %3, i32 %2        ; <i32> [#uses=1]
1474   ret i32 %b_addr.0
1475 }
1476
1477 However, it's functionally equivalent to:
1478
1479          b = (b & ~0x80) | (a & 0x80);
1480
1481 Which generates this:
1482
1483 define i32 @func(i32 %a, i32 %b) nounwind readnone ssp {
1484 entry:
1485   %0 = and i32 %b, -129                           ; <i32> [#uses=1]
1486   %1 = and i32 %a, 128                            ; <i32> [#uses=1]
1487   %2 = or i32 %0, %1                              ; <i32> [#uses=1]
1488   ret i32 %2
1489 }
1490
1491 This can be generalized for other forms:
1492
1493      b = (b & ~0x80) | (a & 0x40) << 1;
1494
1495 //===---------------------------------------------------------------------===//
1496
1497 These two functions produce different code. They shouldn't:
1498
1499 #include <stdint.h>
1500
1501 uint8_t p1(uint8_t b, uint8_t a) {
1502   b = (b & ~0xc0) | (a & 0xc0);
1503   return (b);
1504 }
1505
1506 uint8_t p2(uint8_t b, uint8_t a) {
1507   b = (b & ~0x40) | (a & 0x40);
1508   b = (b & ~0x80) | (a & 0x80);
1509   return (b);
1510 }
1511
1512 define zeroext i8 @p1(i8 zeroext %b, i8 zeroext %a) nounwind readnone ssp {
1513 entry:
1514   %0 = and i8 %b, 63                              ; <i8> [#uses=1]
1515   %1 = and i8 %a, -64                             ; <i8> [#uses=1]
1516   %2 = or i8 %1, %0                               ; <i8> [#uses=1]
1517   ret i8 %2
1518 }
1519
1520 define zeroext i8 @p2(i8 zeroext %b, i8 zeroext %a) nounwind readnone ssp {
1521 entry:
1522   %0 = and i8 %b, 63                              ; <i8> [#uses=1]
1523   %.masked = and i8 %a, 64                        ; <i8> [#uses=1]
1524   %1 = and i8 %a, -128                            ; <i8> [#uses=1]
1525   %2 = or i8 %1, %0                               ; <i8> [#uses=1]
1526   %3 = or i8 %2, %.masked                         ; <i8> [#uses=1]
1527   ret i8 %3
1528 }
1529
1530 //===---------------------------------------------------------------------===//
1531
1532 IPSCCP does not currently propagate argument dependent constants through
1533 functions where it does not not all of the callers.  This includes functions
1534 with normal external linkage as well as templates, C99 inline functions etc.
1535 Specifically, it does nothing to:
1536
1537 define i32 @test(i32 %x, i32 %y, i32 %z) nounwind {
1538 entry:
1539   %0 = add nsw i32 %y, %z
1540   %1 = mul i32 %0, %x
1541   %2 = mul i32 %y, %z
1542   %3 = add nsw i32 %1, %2
1543   ret i32 %3
1544 }
1545
1546 define i32 @test2() nounwind {
1547 entry:
1548   %0 = call i32 @test(i32 1, i32 2, i32 4) nounwind
1549   ret i32 %0
1550 }
1551
1552 It would be interesting extend IPSCCP to be able to handle simple cases like
1553 this, where all of the arguments to a call are constant.  Because IPSCCP runs
1554 before inlining, trivial templates and inline functions are not yet inlined.
1555 The results for a function + set of constant arguments should be memoized in a
1556 map.
1557
1558 //===---------------------------------------------------------------------===//
1559
1560 The libcall constant folding stuff should be moved out of SimplifyLibcalls into
1561 libanalysis' constantfolding logic.  This would allow IPSCCP to be able to
1562 handle simple things like this:
1563
1564 static int foo(const char *X) { return strlen(X); }
1565 int bar() { return foo("abcd"); }
1566
1567 //===---------------------------------------------------------------------===//
1568
1569 functionattrs doesn't know much about memcpy/memset.  This function should be
1570 marked readnone rather than readonly, since it only twiddles local memory, but
1571 functionattrs doesn't handle memset/memcpy/memmove aggressively:
1572
1573 struct X { int *p; int *q; };
1574 int foo() {
1575  int i = 0, j = 1;
1576  struct X x, y;
1577  int **p;
1578  y.p = &i;
1579  x.q = &j;
1580  p = __builtin_memcpy (&x, &y, sizeof (int *));
1581  return **p;
1582 }
1583
1584 This can be seen at:
1585 $ clang t.c -S -o - -mkernel -O0 -emit-llvm | opt -functionattrs -S
1586
1587
1588 //===---------------------------------------------------------------------===//
1589
1590 Missed instcombine transformation:
1591 define i1 @a(i32 %x) nounwind readnone {
1592 entry:
1593   %cmp = icmp eq i32 %x, 30
1594   %sub = add i32 %x, -30
1595   %cmp2 = icmp ugt i32 %sub, 9
1596   %or = or i1 %cmp, %cmp2
1597   ret i1 %or
1598 }
1599 This should be optimized to a single compare.  Testcase derived from gcc.
1600
1601 //===---------------------------------------------------------------------===//
1602
1603 Missed instcombine or reassociate transformation:
1604 int a(int a, int b) { return (a==12)&(b>47)&(b<58); }
1605
1606 The sgt and slt should be combined into a single comparison. Testcase derived
1607 from gcc.
1608
1609 //===---------------------------------------------------------------------===//
1610
1611 Missed instcombine transformation:
1612
1613   %382 = srem i32 %tmp14.i, 64                    ; [#uses=1]
1614   %383 = zext i32 %382 to i64                     ; [#uses=1]
1615   %384 = shl i64 %381, %383                       ; [#uses=1]
1616   %385 = icmp slt i32 %tmp14.i, 64                ; [#uses=1]
1617
1618 The srem can be transformed to an and because if %tmp14.i is negative, the
1619 shift is undefined.  Testcase derived from 403.gcc.
1620
1621 //===---------------------------------------------------------------------===//
1622
1623 This is a range comparison on a divided result (from 403.gcc):
1624
1625   %1337 = sdiv i32 %1336, 8                       ; [#uses=1]
1626   %.off.i208 = add i32 %1336, 7                   ; [#uses=1]
1627   %1338 = icmp ult i32 %.off.i208, 15             ; [#uses=1]
1628
1629 We already catch this (removing the sdiv) if there isn't an add, we should
1630 handle the 'add' as well.  This is a common idiom with it's builtin_alloca code.
1631 C testcase:
1632
1633 int a(int x) { return (unsigned)(x/16+7) < 15; }
1634
1635 Another similar case involves truncations on 64-bit targets:
1636
1637   %361 = sdiv i64 %.046, 8                        ; [#uses=1]
1638   %362 = trunc i64 %361 to i32                    ; [#uses=2]
1639 ...
1640   %367 = icmp eq i32 %362, 0                      ; [#uses=1]
1641
1642 //===---------------------------------------------------------------------===//
1643
1644 Missed instcombine/dagcombine transformation:
1645 define void @lshift_lt(i8 zeroext %a) nounwind {
1646 entry:
1647   %conv = zext i8 %a to i32
1648   %shl = shl i32 %conv, 3
1649   %cmp = icmp ult i32 %shl, 33
1650   br i1 %cmp, label %if.then, label %if.end
1651
1652 if.then:
1653   tail call void @bar() nounwind
1654   ret void
1655
1656 if.end:
1657   ret void
1658 }
1659 declare void @bar() nounwind
1660
1661 The shift should be eliminated.  Testcase derived from gcc.
1662
1663 //===---------------------------------------------------------------------===//
1664
1665 These compile into different code, one gets recognized as a switch and the
1666 other doesn't due to phase ordering issues (PR6212):
1667
1668 int test1(int mainType, int subType) {
1669   if (mainType == 7)
1670     subType = 4;
1671   else if (mainType == 9)
1672     subType = 6;
1673   else if (mainType == 11)
1674     subType = 9;
1675   return subType;
1676 }
1677
1678 int test2(int mainType, int subType) {
1679   if (mainType == 7)
1680     subType = 4;
1681   if (mainType == 9)
1682     subType = 6;
1683   if (mainType == 11)
1684     subType = 9;
1685   return subType;
1686 }
1687
1688 //===---------------------------------------------------------------------===//
1689
1690 The following test case (from PR6576):
1691
1692 define i32 @mul(i32 %a, i32 %b) nounwind readnone {
1693 entry:
1694  %cond1 = icmp eq i32 %b, 0                      ; <i1> [#uses=1]
1695  br i1 %cond1, label %exit, label %bb.nph
1696 bb.nph:                                           ; preds = %entry
1697  %tmp = mul i32 %b, %a                           ; <i32> [#uses=1]
1698  ret i32 %tmp
1699 exit:                                             ; preds = %entry
1700  ret i32 0
1701 }
1702
1703 could be reduced to:
1704
1705 define i32 @mul(i32 %a, i32 %b) nounwind readnone {
1706 entry:
1707  %tmp = mul i32 %b, %a
1708  ret i32 %tmp
1709 }
1710
1711 //===---------------------------------------------------------------------===//
1712
1713 We should use DSE + llvm.lifetime.end to delete dead vtable pointer updates.
1714 See GCC PR34949
1715
1716 Another interesting case is that something related could be used for variables
1717 that go const after their ctor has finished.  In these cases, globalopt (which
1718 can statically run the constructor) could mark the global const (so it gets put
1719 in the readonly section).  A testcase would be:
1720
1721 #include <complex>
1722 using namespace std;
1723 const complex<char> should_be_in_rodata (42,-42);
1724 complex<char> should_be_in_data (42,-42);
1725 complex<char> should_be_in_bss;
1726
1727 Where we currently evaluate the ctors but the globals don't become const because
1728 the optimizer doesn't know they "become const" after the ctor is done.  See
1729 GCC PR4131 for more examples.
1730
1731 //===---------------------------------------------------------------------===//
1732
1733 In this code:
1734
1735 long foo(long x) {
1736   return x > 1 ? x : 1;
1737 }
1738
1739 LLVM emits a comparison with 1 instead of 0. 0 would be equivalent
1740 and cheaper on most targets.
1741
1742 LLVM prefers comparisons with zero over non-zero in general, but in this
1743 case it choses instead to keep the max operation obvious.
1744
1745 //===---------------------------------------------------------------------===//
1746
1747 define void @a(i32 %x) nounwind {
1748 entry:
1749   switch i32 %x, label %if.end [
1750     i32 0, label %if.then
1751     i32 1, label %if.then
1752     i32 2, label %if.then
1753     i32 3, label %if.then
1754     i32 5, label %if.then
1755   ]
1756 if.then:
1757   tail call void @foo() nounwind
1758   ret void
1759 if.end:
1760   ret void
1761 }
1762 declare void @foo()
1763
1764 Generated code on x86-64 (other platforms give similar results):
1765 a:
1766         cmpl    $5, %edi
1767         ja      LBB2_2
1768         cmpl    $4, %edi
1769         jne     LBB2_3
1770 .LBB0_2:
1771         ret
1772 .LBB0_3:
1773         jmp     foo  # TAILCALL
1774
1775 If we wanted to be really clever, we could simplify the whole thing to
1776 something like the following, which eliminates a branch:
1777         xorl    $1, %edi
1778         cmpl    $4, %edi
1779         ja      .LBB0_2
1780         ret
1781 .LBB0_2:
1782         jmp     foo  # TAILCALL
1783
1784 //===---------------------------------------------------------------------===//
1785
1786 We compile this:
1787
1788 int foo(int a) { return (a & (~15)) / 16; }
1789
1790 Into:
1791
1792 define i32 @foo(i32 %a) nounwind readnone ssp {
1793 entry:
1794   %and = and i32 %a, -16
1795   %div = sdiv i32 %and, 16
1796   ret i32 %div
1797 }
1798
1799 but this code (X & -A)/A is X >> log2(A) when A is a power of 2, so this case
1800 should be instcombined into just "a >> 4".
1801
1802 We do get this at the codegen level, so something knows about it, but
1803 instcombine should catch it earlier:
1804
1805 _foo:                                   ## @foo
1806 ## BB#0:                                ## %entry
1807         movl    %edi, %eax
1808         sarl    $4, %eax
1809         ret
1810
1811 //===---------------------------------------------------------------------===//
1812
1813 This code (from GCC PR28685):
1814
1815 int test(int a, int b) {
1816   int lt = a < b;
1817   int eq = a == b;
1818   if (lt)
1819     return 1;
1820   return eq;
1821 }
1822
1823 Is compiled to:
1824
1825 define i32 @test(i32 %a, i32 %b) nounwind readnone ssp {
1826 entry:
1827   %cmp = icmp slt i32 %a, %b
1828   br i1 %cmp, label %return, label %if.end
1829
1830 if.end:                                           ; preds = %entry
1831   %cmp5 = icmp eq i32 %a, %b
1832   %conv6 = zext i1 %cmp5 to i32
1833   ret i32 %conv6
1834
1835 return:                                           ; preds = %entry
1836   ret i32 1
1837 }
1838
1839 it could be:
1840
1841 define i32 @test__(i32 %a, i32 %b) nounwind readnone ssp {
1842 entry:
1843   %0 = icmp sle i32 %a, %b
1844   %retval = zext i1 %0 to i32
1845   ret i32 %retval
1846 }
1847
1848 //===---------------------------------------------------------------------===//
1849
1850 This code can be seen in viterbi:
1851
1852   %64 = call noalias i8* @malloc(i64 %62) nounwind
1853 ...
1854   %67 = call i64 @llvm.objectsize.i64(i8* %64, i1 false) nounwind
1855   %68 = call i8* @__memset_chk(i8* %64, i32 0, i64 %62, i64 %67) nounwind
1856
1857 llvm.objectsize.i64 should be taught about malloc/calloc, allowing it to
1858 fold to %62.  This is a security win (overflows of malloc will get caught)
1859 and also a performance win by exposing more memsets to the optimizer.
1860
1861 This occurs several times in viterbi.
1862
1863 Note that this would change the semantics of @llvm.objectsize which by its
1864 current definition always folds to a constant. We also should make sure that
1865 we remove checking in code like
1866
1867   char *p = malloc(strlen(s)+1);
1868   __strcpy_chk(p, s, __builtin_objectsize(p, 0));
1869
1870 //===---------------------------------------------------------------------===//
1871
1872 This code (from Benchmarks/Dhrystone/dry.c):
1873
1874 define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
1875 entry:
1876   %sext = shl i32 %0, 24
1877   %conv = ashr i32 %sext, 24
1878   %sext6 = shl i32 %1, 24
1879   %conv4 = ashr i32 %sext6, 24
1880   %cmp = icmp eq i32 %conv, %conv4
1881   %. = select i1 %cmp, i32 10000, i32 0
1882   ret i32 %.
1883 }
1884
1885 Should be simplified into something like:
1886
1887 define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
1888 entry:
1889   %sext = shl i32 %0, 24
1890   %conv = and i32 %sext, 0xFF000000
1891   %sext6 = shl i32 %1, 24
1892   %conv4 = and i32 %sext6, 0xFF000000
1893   %cmp = icmp eq i32 %conv, %conv4
1894   %. = select i1 %cmp, i32 10000, i32 0
1895   ret i32 %.
1896 }
1897
1898 and then to:
1899
1900 define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
1901 entry:
1902   %conv = and i32 %0, 0xFF
1903   %conv4 = and i32 %1, 0xFF
1904   %cmp = icmp eq i32 %conv, %conv4
1905   %. = select i1 %cmp, i32 10000, i32 0
1906   ret i32 %.
1907 }
1908 //===---------------------------------------------------------------------===//
1909
1910 clang -O3 currently compiles this code
1911
1912 int g(unsigned int a) {
1913   unsigned int c[100];
1914   c[10] = a;
1915   c[11] = a;
1916   unsigned int b = c[10] + c[11];
1917   if(b > a*2) a = 4;
1918   else a = 8;
1919   return a + 7;
1920 }
1921
1922 into
1923
1924 define i32 @g(i32 a) nounwind readnone {
1925   %add = shl i32 %a, 1
1926   %mul = shl i32 %a, 1
1927   %cmp = icmp ugt i32 %add, %mul
1928   %a.addr.0 = select i1 %cmp, i32 11, i32 15
1929   ret i32 %a.addr.0
1930 }
1931
1932 The icmp should fold to false. This CSE opportunity is only available
1933 after GVN and InstCombine have run.
1934
1935 //===---------------------------------------------------------------------===//
1936
1937 memcpyopt should turn this:
1938
1939 define i8* @test10(i32 %x) {
1940   %alloc = call noalias i8* @malloc(i32 %x) nounwind
1941   call void @llvm.memset.p0i8.i32(i8* %alloc, i8 0, i32 %x, i32 1, i1 false)
1942   ret i8* %alloc
1943 }
1944
1945 into a call to calloc.  We should make sure that we analyze calloc as
1946 aggressively as malloc though.
1947
1948 //===---------------------------------------------------------------------===//
1949
1950 clang -O3 doesn't optimize this:
1951
1952 void f1(int* begin, int* end) {
1953   std::fill(begin, end, 0);
1954 }
1955
1956 into a memset.  This is PR8942.
1957
1958 //===---------------------------------------------------------------------===//
1959
1960 clang -O3 -fno-exceptions currently compiles this code:
1961
1962 void f(int N) {
1963   std::vector<int> v(N);
1964
1965   extern void sink(void*); sink(&v);
1966 }
1967
1968 into
1969
1970 define void @_Z1fi(i32 %N) nounwind {
1971 entry:
1972   %v2 = alloca [3 x i32*], align 8
1973   %v2.sub = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 0
1974   %tmpcast = bitcast [3 x i32*]* %v2 to %"class.std::vector"*
1975   %conv = sext i32 %N to i64
1976   store i32* null, i32** %v2.sub, align 8, !tbaa !0
1977   %tmp3.i.i.i.i.i = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 1
1978   store i32* null, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
1979   %tmp4.i.i.i.i.i = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 2
1980   store i32* null, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
1981   %cmp.i.i.i.i = icmp eq i32 %N, 0
1982   br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.thread.i.i, label %cond.true.i.i.i.i
1983
1984 _ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.thread.i.i: ; preds = %entry
1985   store i32* null, i32** %v2.sub, align 8, !tbaa !0
1986   store i32* null, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
1987   %add.ptr.i5.i.i = getelementptr inbounds i32* null, i64 %conv
1988   store i32* %add.ptr.i5.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
1989   br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit
1990
1991 cond.true.i.i.i.i:                                ; preds = %entry
1992   %cmp.i.i.i.i.i = icmp slt i32 %N, 0
1993   br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i
1994
1995 if.then.i.i.i.i.i:                                ; preds = %cond.true.i.i.i.i
1996   call void @_ZSt17__throw_bad_allocv() noreturn nounwind
1997   unreachable
1998
1999 _ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i:    ; preds = %cond.true.i.i.i.i
2000   %mul.i.i.i.i.i = shl i64 %conv, 2
2001   %call3.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul.i.i.i.i.i) nounwind
2002   %0 = bitcast i8* %call3.i.i.i.i.i to i32*
2003   store i32* %0, i32** %v2.sub, align 8, !tbaa !0
2004   store i32* %0, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
2005   %add.ptr.i.i.i = getelementptr inbounds i32* %0, i64 %conv
2006   store i32* %add.ptr.i.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
2007   call void @llvm.memset.p0i8.i64(i8* %call3.i.i.i.i.i, i8 0, i64 %mul.i.i.i.i.i, i32 4, i1 false)
2008   br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit
2009
2010 This is just the handling the construction of the vector. Most surprising here
2011 is the fact that all three null stores in %entry are dead (because we do no
2012 cross-block DSE).
2013
2014 Also surprising is that %conv isn't simplified to 0 in %....exit.thread.i.i.
2015 This is a because the client of LazyValueInfo doesn't simplify all instruction
2016 operands, just selected ones.
2017
2018 //===---------------------------------------------------------------------===//
2019
2020 clang -O3 -fno-exceptions currently compiles this code:
2021
2022 void f(char* a, int n) {
2023   __builtin_memset(a, 0, n);
2024   for (int i = 0; i < n; ++i)
2025     a[i] = 0;
2026 }
2027
2028 into:
2029
2030 define void @_Z1fPci(i8* nocapture %a, i32 %n) nounwind {
2031 entry:
2032   %conv = sext i32 %n to i64
2033   tail call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %conv, i32 1, i1 false)
2034   %cmp8 = icmp sgt i32 %n, 0
2035   br i1 %cmp8, label %for.body.lr.ph, label %for.end
2036
2037 for.body.lr.ph:                                   ; preds = %entry
2038   %tmp10 = add i32 %n, -1
2039   %tmp11 = zext i32 %tmp10 to i64
2040   %tmp12 = add i64 %tmp11, 1
2041   call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %tmp12, i32 1, i1 false)
2042   ret void
2043
2044 for.end:                                          ; preds = %entry
2045   ret void
2046 }
2047
2048 This shouldn't need the ((zext (%n - 1)) + 1) game, and it should ideally fold
2049 the two memset's together.
2050
2051 The issue with the addition only occurs in 64-bit mode, and appears to be at
2052 least partially caused by Scalar Evolution not keeping its cache updated: it
2053 returns the "wrong" result immediately after indvars runs, but figures out the
2054 expected result if it is run from scratch on IR resulting from running indvars.
2055
2056 //===---------------------------------------------------------------------===//
2057
2058 clang -O3 -fno-exceptions currently compiles this code:
2059
2060 struct S {
2061   unsigned short m1, m2;
2062   unsigned char m3, m4;
2063 };
2064
2065 void f(int N) {
2066   std::vector<S> v(N);
2067   extern void sink(void*); sink(&v);
2068 }
2069
2070 into poor code for zero-initializing 'v' when N is >0. The problem is that
2071 S is only 6 bytes, but each element is 8 byte-aligned. We generate a loop and
2072 4 stores on each iteration. If the struct were 8 bytes, this gets turned into
2073 a memset.
2074
2075 In order to handle this we have to:
2076   A) Teach clang to generate metadata for memsets of structs that have holes in
2077      them.
2078   B) Teach clang to use such a memset for zero init of this struct (since it has
2079      a hole), instead of doing elementwise zeroing.
2080
2081 //===---------------------------------------------------------------------===//
2082
2083 clang -O3 currently compiles this code:
2084
2085 extern const int magic;
2086 double f() { return 0.0 * magic; }
2087
2088 into
2089
2090 @magic = external constant i32
2091
2092 define double @_Z1fv() nounwind readnone {
2093 entry:
2094   %tmp = load i32* @magic, align 4, !tbaa !0
2095   %conv = sitofp i32 %tmp to double
2096   %mul = fmul double %conv, 0.000000e+00
2097   ret double %mul
2098 }
2099
2100 We should be able to fold away this fmul to 0.0.  More generally, fmul(x,0.0)
2101 can be folded to 0.0 if we can prove that the LHS is not -0.0, not a NaN, and
2102 not an INF.  The CannotBeNegativeZero predicate in value tracking should be
2103 extended to support general "fpclassify" operations that can return
2104 yes/no/unknown for each of these predicates.
2105
2106 In this predicate, we know that uitofp is trivially never NaN or -0.0, and
2107 we know that it isn't +/-Inf if the floating point type has enough exponent bits
2108 to represent the largest integer value as < inf.
2109
2110 //===---------------------------------------------------------------------===//
2111
2112 When optimizing a transformation that can change the sign of 0.0 (such as the
2113 0.0*val -> 0.0 transformation above), it might be provable that the sign of the
2114 expression doesn't matter.  For example, by the above rules, we can't transform
2115 fmul(sitofp(x), 0.0) into 0.0, because x might be -1 and the result of the
2116 expression is defined to be -0.0.
2117
2118 If we look at the uses of the fmul for example, we might be able to prove that
2119 all uses don't care about the sign of zero.  For example, if we have:
2120
2121   fadd(fmul(sitofp(x), 0.0), 2.0)
2122
2123 Since we know that x+2.0 doesn't care about the sign of any zeros in X, we can
2124 transform the fmul to 0.0, and then the fadd to 2.0.
2125
2126 //===---------------------------------------------------------------------===//
2127
2128 We should enhance memcpy/memcpy/memset to allow a metadata node on them
2129 indicating that some bytes of the transfer are undefined.  This is useful for
2130 frontends like clang when lowering struct copies, when some elements of the
2131 struct are undefined.  Consider something like this:
2132
2133 struct x {
2134   char a;
2135   int b[4];
2136 };
2137 void foo(struct x*P);
2138 struct x testfunc() {
2139   struct x V1, V2;
2140   foo(&V1);
2141   V2 = V1;
2142
2143   return V2;
2144 }
2145
2146 We currently compile this to:
2147 $ clang t.c -S -o - -O0 -emit-llvm | opt -scalarrepl -S
2148
2149
2150 %struct.x = type { i8, [4 x i32] }
2151
2152 define void @testfunc(%struct.x* sret %agg.result) nounwind ssp {
2153 entry:
2154   %V1 = alloca %struct.x, align 4
2155   call void @foo(%struct.x* %V1)
2156   %tmp1 = bitcast %struct.x* %V1 to i8*
2157   %0 = bitcast %struct.x* %V1 to i160*
2158   %srcval1 = load i160* %0, align 4
2159   %tmp2 = bitcast %struct.x* %agg.result to i8*
2160   %1 = bitcast %struct.x* %agg.result to i160*
2161   store i160 %srcval1, i160* %1, align 4
2162   ret void
2163 }
2164
2165 This happens because SRoA sees that the temp alloca has is being memcpy'd into
2166 and out of and it has holes and it has to be conservative.  If we knew about the
2167 holes, then this could be much much better.
2168
2169 Having information about these holes would also improve memcpy (etc) lowering at
2170 llc time when it gets inlined, because we can use smaller transfers.  This also
2171 avoids partial register stalls in some important cases.
2172
2173 //===---------------------------------------------------------------------===//
2174
2175 We don't fold (icmp (add) (add)) unless the two adds only have a single use.
2176 There are a lot of cases that we're refusing to fold in (e.g.) 256.bzip2, for
2177 example:
2178
2179  %indvar.next90 = add i64 %indvar89, 1     ;; Has 2 uses
2180  %tmp96 = add i64 %tmp95, 1                ;; Has 1 use
2181  %exitcond97 = icmp eq i64 %indvar.next90, %tmp96
2182
2183 We don't fold this because we don't want to introduce an overlapped live range
2184 of the ivar.  However if we can make this more aggressive without causing
2185 performance issues in two ways:
2186
2187 1. If *either* the LHS or RHS has a single use, we can definitely do the
2188    transformation.  In the overlapping liverange case we're trading one register
2189    use for one fewer operation, which is a reasonable trade.  Before doing this
2190    we should verify that the llc output actually shrinks for some benchmarks.
2191 2. If both ops have multiple uses, we can still fold it if the operations are
2192    both sinkable to *after* the icmp (e.g. in a subsequent block) which doesn't
2193    increase register pressure.
2194
2195 There are a ton of icmp's we aren't simplifying because of the reg pressure
2196 concern.  Care is warranted here though because many of these are induction
2197 variables and other cases that matter a lot to performance, like the above.
2198 Here's a blob of code that you can drop into the bottom of visitICmp to see some
2199 missed cases:
2200
2201   { Value *A, *B, *C, *D;
2202     if (match(Op0, m_Add(m_Value(A), m_Value(B))) &&
2203         match(Op1, m_Add(m_Value(C), m_Value(D))) &&
2204         (A == C || A == D || B == C || B == D)) {
2205       errs() << "OP0 = " << *Op0 << "  U=" << Op0->getNumUses() << "\n";
2206       errs() << "OP1 = " << *Op1 << "  U=" << Op1->getNumUses() << "\n";
2207       errs() << "CMP = " << I << "\n\n";
2208     }
2209   }
2210
2211 //===---------------------------------------------------------------------===//
2212
2213 define i1 @test1(i32 %x) nounwind {
2214   %and = and i32 %x, 3
2215   %cmp = icmp ult i32 %and, 2
2216   ret i1 %cmp
2217 }
2218
2219 Can be folded to (x & 2) == 0.
2220
2221 define i1 @test2(i32 %x) nounwind {
2222   %and = and i32 %x, 3
2223   %cmp = icmp ugt i32 %and, 1
2224   ret i1 %cmp
2225 }
2226
2227 Can be folded to (x & 2) != 0.
2228
2229 SimplifyDemandedBits shrinks the "and" constant to 2 but instcombine misses the
2230 icmp transform.
2231
2232 //===---------------------------------------------------------------------===//
2233
2234 This code:
2235
2236 typedef struct {
2237 int f1:1;
2238 int f2:1;
2239 int f3:1;
2240 int f4:29;
2241 } t1;
2242
2243 typedef struct {
2244 int f1:1;
2245 int f2:1;
2246 int f3:30;
2247 } t2;
2248
2249 t1 s1;
2250 t2 s2;
2251
2252 void func1(void)
2253 {
2254 s1.f1 = s2.f1;
2255 s1.f2 = s2.f2;
2256 }
2257
2258 Compiles into this IR (on x86-64 at least):
2259
2260 %struct.t1 = type { i8, [3 x i8] }
2261 @s2 = global %struct.t1 zeroinitializer, align 4
2262 @s1 = global %struct.t1 zeroinitializer, align 4
2263 define void @func1() nounwind ssp noredzone {
2264 entry:
2265   %0 = load i32* bitcast (%struct.t1* @s2 to i32*), align 4
2266   %bf.val.sext5 = and i32 %0, 1
2267   %1 = load i32* bitcast (%struct.t1* @s1 to i32*), align 4
2268   %2 = and i32 %1, -4
2269   %3 = or i32 %2, %bf.val.sext5
2270   %bf.val.sext26 = and i32 %0, 2
2271   %4 = or i32 %3, %bf.val.sext26
2272   store i32 %4, i32* bitcast (%struct.t1* @s1 to i32*), align 4
2273   ret void
2274 }
2275
2276 The two or/and's should be merged into one each.
2277
2278 //===---------------------------------------------------------------------===//
2279
2280 Machine level code hoisting can be useful in some cases.  For example, PR9408
2281 is about:
2282
2283 typedef union {
2284  void (*f1)(int);
2285  void (*f2)(long);
2286 } funcs;
2287
2288 void foo(funcs f, int which) {
2289  int a = 5;
2290  if (which) {
2291    f.f1(a);
2292  } else {
2293    f.f2(a);
2294  }
2295 }
2296
2297 which we compile to:
2298
2299 foo:                                    # @foo
2300 # BB#0:                                 # %entry
2301        pushq   %rbp
2302        movq    %rsp, %rbp
2303        testl   %esi, %esi
2304        movq    %rdi, %rax
2305        je      .LBB0_2
2306 # BB#1:                                 # %if.then
2307        movl    $5, %edi
2308        callq   *%rax
2309        popq    %rbp
2310        ret
2311 .LBB0_2:                                # %if.else
2312        movl    $5, %edi
2313        callq   *%rax
2314        popq    %rbp
2315        ret
2316
2317 Note that bb1 and bb2 are the same.  This doesn't happen at the IR level
2318 because one call is passing an i32 and the other is passing an i64.
2319
2320 //===---------------------------------------------------------------------===//
2321
2322 I see this sort of pattern in 176.gcc in a few places (e.g. the start of
2323 store_bit_field).  The rem should be replaced with a multiply and subtract:
2324
2325   %3 = sdiv i32 %A, %B
2326   %4 = srem i32 %A, %B
2327
2328 Similarly for udiv/urem.  Note that this shouldn't be done on X86 or ARM,
2329 which can do this in a single operation (instruction or libcall).  It is
2330 probably best to do this in the code generator.
2331
2332 //===---------------------------------------------------------------------===//
2333
2334 unsigned foo(unsigned x, unsigned y) { return (x & y) == 0 || x == 0; }
2335 should fold to (x & y) == 0.
2336
2337 //===---------------------------------------------------------------------===//
2338
2339 unsigned foo(unsigned x, unsigned y) { return x > y && x != 0; }
2340 should fold to x > y.
2341
2342 //===---------------------------------------------------------------------===//