lib/Target/README.txt

   1 Target Independent Opportunities:
   2
   3 //===---------------------------------------------------------------------===//
   4
   5 With the recent changes to make the implicit def/use set explicit in
   6 machineinstrs, we should change the target descriptions for 'call' instructions
   7 so that the .td files don't list all the call-clobbered registers as implicit
   8 defs.  Instead, these should be added by the code generator (e.g. on the dag).
   9
  10 This has a number of uses:
  11
  12 1. PPC32/64 and X86 32/64 can avoid having multiple copies of call instructions
  13    for their different impdef sets.
  14 2. Targets with multiple calling convs (e.g. x86) which have different clobber
  15    sets don't need copies of call instructions.
  16 3. 'Interprocedural register allocation' can be done to reduce the clobber sets
  17    of calls.
  18
  19 //===---------------------------------------------------------------------===//
  20
  21 FreeBench/mason contains code like this:
  22
  23 static p_type m0u(p_type p) {
  24   int m[]={0, 8, 1, 2, 16, 5, 13, 7, 14, 9, 3, 4, 11, 12, 15, 10, 17, 6};
  25   p_type pu;
  26   pu.a = m[p.a];
  27   pu.b = m[p.b];
  28   pu.c = m[p.c];
  29   return pu;
  30 }
  31
  32 We currently compile this into a memcpy from a static array into 'm', then
  33 a bunch of loads from m.  It would be better to avoid the memcpy and just do
  34 loads from the static array.
  35
  36 //===---------------------------------------------------------------------===//
  37
  38 Make the PPC branch selector target independant
  39
  40 //===---------------------------------------------------------------------===//
  41
  42 Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
  43 precision don't matter (ffastmath).  Misc/mandel will like this. :)
  44
  45 //===---------------------------------------------------------------------===//
  46
  47 Solve this DAG isel folding deficiency:
  48
  49 int X, Y;
  50
  51 void fn1(void)
  52 {
  53   X = X | (Y << 3);
  54 }
  55
  56 compiles to
  57
  58 fn1:
  59         movl Y, %eax
  60         shll $3, %eax
  61         orl X, %eax
  62         movl %eax, X
  63         ret
  64
  65 The problem is the store's chain operand is not the load X but rather
  66 a TokenFactor of the load X and load Y, which prevents the folding.
  67
  68 There are two ways to fix this:
  69
  70 1. The dag combiner can start using alias analysis to realize that y/x
  71    don't alias, making the store to X not dependent on the load from Y.
  72 2. The generated isel could be made smarter in the case it can't
  73    disambiguate the pointers.
  74
  75 Number 1 is the preferred solution.
  76
  77 This has been "fixed" by a TableGen hack. But that is a short term workaround
  78 which will be removed once the proper fix is made.
  79
  80 //===---------------------------------------------------------------------===//
  81
  82 On targets with expensive 64-bit multiply, we could LSR this:
  83
  84 for (i = ...; ++i) {
  85    x = 1ULL << i;
  86
  87 into:
  88  long long tmp = 1;
  89  for (i = ...; ++i, tmp+=tmp)
  90    x = tmp;
  91
  92 This would be a win on ppc32, but not x86 or ppc64.
  93
  94 //===---------------------------------------------------------------------===//
  95
  96 Shrink: (setlt (loadi32 P), 0) -> (setlt (loadi8 Phi), 0)
  97
  98 //===---------------------------------------------------------------------===//
  99
 100 Reassociate should turn: X*X*X*X -> t=(X*X) (t*t) to eliminate a multiply.
 101
 102 //===---------------------------------------------------------------------===//
 103
 104 Interesting? testcase for add/shift/mul reassoc:
 105
 106 int bar(int x, int y) {
 107   return x*x*x+y+x*x*x*x*x*y*y*y*y;
 108 }
 109 int foo(int z, int n) {
 110   return bar(z, n) + bar(2*z, 2*n);
 111 }
 112
 113 //===---------------------------------------------------------------------===//
 114
 115 These two functions should generate the same code on big-endian systems:
 116
 117 int g(int *j,int *l)  {  return memcmp(j,l,4);  }
 118 int h(int *j, int *l) {  return *j - *l; }
 119
 120 this could be done in SelectionDAGISel.cpp, along with other special cases,
 121 for 1,2,4,8 bytes.
 122
 123 //===---------------------------------------------------------------------===//
 124
 125 Add LSR exit value substitution. It'll probably be a win for Ackermann, etc.
 126
 127 //===---------------------------------------------------------------------===//
 128
 129 It would be nice to revert this patch:
 130 http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
 131
 132 And teach the dag combiner enough to simplify the code expanded before
 133 legalize.  It seems plausible that this knowledge would let it simplify other
 134 stuff too.
 135
 136 //===---------------------------------------------------------------------===//
 137
 138 For packed types, TargetData.cpp::getTypeInfo() returns alignment that is equal
 139 to the type size. It works but can be overly conservative as the alignment of
 140 specific packed types are target dependent.
 141
 142 //===---------------------------------------------------------------------===//
 143
 144 We should add 'unaligned load/store' nodes, and produce them from code like
 145 this:
 146
 147 v4sf example(float *P) {
 148   return (v4sf){P[0], P[1], P[2], P[3] };
 149 }
 150
 151 //===---------------------------------------------------------------------===//
 152
 153 We should constant fold packed type casts at the LLVM level, regardless of the
 154 cast.  Currently we cannot fold some casts because we don't have TargetData
 155 information in the constant folder, so we don't know the endianness of the
 156 target!
 157
 158 //===---------------------------------------------------------------------===//
 159
 160 Add support for conditional increments, and other related patterns.  Instead
 161 of:
 162
 163         movl 136(%esp), %eax
 164         cmpl $0, %eax
 165         je LBB16_2      #cond_next
 166 LBB16_1:        #cond_true
 167         incl _foo
 168 LBB16_2:        #cond_next
 169
 170 emit:
 171         movl    _foo, %eax
 172         cmpl    $1, %edi
 173         sbbl    $-1, %eax
 174         movl    %eax, _foo
 175
 176 //===---------------------------------------------------------------------===//
 177
 178 Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
 179
 180 Expand these to calls of sin/cos and stores:
 181       double sincos(double x, double *sin, double *cos);
 182       float sincosf(float x, float *sin, float *cos);
 183       long double sincosl(long double x, long double *sin, long double *cos);
 184
 185 Doing so could allow SROA of the destination pointers.  See also:
 186 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17687
 187
 188 //===---------------------------------------------------------------------===//
 189
 190 Scalar Repl cannot currently promote this testcase to 'ret long cst':
 191
 192         %struct.X = type { int, int }
 193         %struct.Y = type { %struct.X }
 194 ulong %bar() {
 195         %retval = alloca %struct.Y, align 8
 196         %tmp12 = getelementptr %struct.Y* %retval, int 0, uint 0, uint 0
 197         store int 0, int* %tmp12
 198         %tmp15 = getelementptr %struct.Y* %retval, int 0, uint 0, uint 1
 199         store int 1, int* %tmp15
 200         %retval = bitcast %struct.Y* %retval to ulong*
 201         %retval = load ulong* %retval
 202         ret ulong %retval
 203 }
 204
 205 it should be extended to do so.
 206
 207 //===---------------------------------------------------------------------===//
 208
 209 -scalarrepl should promote this to be a vector scalar.
 210
 211         %struct..0anon = type { <4 x float> }
 212
 213 implementation   ; Functions:
 214
 215 void %test1(<4 x float> %V, float* %P) {
 216         %u = alloca %struct..0anon, align 16
 217         %tmp = getelementptr %struct..0anon* %u, int 0, uint 0
 218         store <4 x float> %V, <4 x float>* %tmp
 219         %tmp1 = bitcast %struct..0anon* %u to [4 x float]*
 220         %tmp = getelementptr [4 x float]* %tmp1, int 0, int 1
 221         %tmp = load float* %tmp
 222         %tmp3 = mul float %tmp, 2.000000e+00
 223         store float %tmp3, float* %P
 224         ret void
 225 }
 226
 227 //===---------------------------------------------------------------------===//
 228
 229 Turn this into a single byte store with no load (the other 3 bytes are
 230 unmodified):
 231
 232 void %test(uint* %P) {
 233         %tmp = load uint* %P
 234         %tmp14 = or uint %tmp, 3305111552
 235         %tmp15 = and uint %tmp14, 3321888767
 236         store uint %tmp15, uint* %P
 237         ret void
 238 }
 239
 240 //===---------------------------------------------------------------------===//
 241
 242 dag/inst combine "clz(x)>>5 -> x==0" for 32-bit x.
 243
 244 Compile:
 245
 246 int bar(int x)
 247 {
 248   int t = __builtin_clz(x);
 249   return -(t>>5);
 250 }
 251
 252 to:
 253
 254 _bar:   addic r3,r3,-1
 255         subfe r3,r3,r3
 256         blr
 257
 258 //===---------------------------------------------------------------------===//
 259
 260 Legalize should lower ctlz like this:
 261   ctlz(x) = popcnt((x-1) & ~x)
 262
 263 on targets that have popcnt but not ctlz.  itanium, what else?
 264
 265 //===---------------------------------------------------------------------===//
 266
 267 quantum_sigma_x in 462.libquantum contains the following loop:
 268
 269       for(i=0; i<reg->size; i++)
 270         {
 271           /* Flip the target bit of each basis state */
 272           reg->node[i].state ^= ((MAX_UNSIGNED) 1 << target);
 273         }
 274
 275 Where MAX_UNSIGNED/state is a 64-bit int.  On a 32-bit platform it would be just
 276 so cool to turn it into something like:
 277
 278    long long Res = ((MAX_UNSIGNED) 1 << target);
 279    if (target < 32) {
 280      for(i=0; i<reg->size; i++)
 281        reg->node[i].state ^= Res & 0xFFFFFFFFULL;
 282    } else {
 283      for(i=0; i<reg->size; i++)
 284        reg->node[i].state ^= Res & 0xFFFFFFFF00000000ULL
 285    }
 286
 287 ... which would only do one 32-bit XOR per loop iteration instead of two.
 288
 289 It would also be nice to recognize the reg->size doesn't alias reg->node[i], but
 290 alas...
 291
 292 //===---------------------------------------------------------------------===//
 293
 294 This isn't recognized as bswap by instcombine:
 295
 296 unsigned int swap_32(unsigned int v) {
 297   v = ((v & 0x00ff00ffU) << 8)  | ((v & 0xff00ff00U) >> 8);
 298   v = ((v & 0x0000ffffU) << 16) | ((v & 0xffff0000U) >> 16);
 299   return v;
 300 }
 301
 302 Nor is this (yes, it really is bswap):
 303
 304 unsigned long reverse(unsigned v) {
 305     unsigned t;
 306     t = v ^ ((v << 16) | (v >> 16));
 307     t &= ~0xff0000;
 308     v = (v << 24) | (v >> 8);
 309     return v ^ (t >> 8);
 310 }
 311
 312 //===---------------------------------------------------------------------===//
 313
 314 These should turn into single 16-bit (unaligned?) loads on little/big endian
 315 processors.
 316
 317 unsigned short read_16_le(const unsigned char *adr) {
 318   return adr[0] | (adr[1] << 8);
 319 }
 320 unsigned short read_16_be(const unsigned char *adr) {
 321   return (adr[0] << 8) | adr[1];
 322 }
 323
 324 //===---------------------------------------------------------------------===//
 325
 326 -instcombine should handle this transform:
 327    icmp pred (sdiv X / C1 ), C2
 328 when X, C1, and C2 are unsigned.  Similarly for udiv and signed operands.
 329
 330 Currently InstCombine avoids this transform but will do it when the signs of
 331 the operands and the sign of the divide match. See the FIXME in
 332 InstructionCombining.cpp in the visitSetCondInst method after the switch case
 333 for Instruction::UDiv (around line 4447) for more details.
 334
 335 The SingleSource/Benchmarks/Shootout-C++/hash and hash2 tests have examples of
 336 this construct.
 337
 338 //===---------------------------------------------------------------------===//
 339
 340 Instcombine misses several of these cases (see the testcase in the patch):
 341 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg01519.html
 342
 343 //===---------------------------------------------------------------------===//
 344
 345 viterbi speeds up *significantly* if the various "history" related copy loops
 346 are turned into memcpy calls at the source level.  We need a "loops to memcpy"
 347 pass.
 348
 349 //===---------------------------------------------------------------------===//
 350
 351 -predsimplify should transform this:
 352
 353 void bad(unsigned x)
 354 {
 355   if (x > 4)
 356     bar(12);
 357   else if (x > 3)
 358     bar(523);
 359   else if (x > 2)
 360     bar(36);
 361   else if (x > 1)
 362     bar(65);
 363   else if (x > 0)
 364     bar(45);
 365   else
 366     bar(367);
 367 }
 368
 369 into:
 370
 371 void good(unsigned x)
 372 {
 373   if (x == 4)
 374     bar(523);
 375   else if (x == 3)
 376     bar(36);
 377   else if (x == 2)
 378     bar(65);
 379   else if (x == 1)
 380     bar(45);
 381   else if (x == 0)
 382     bar(367);
 383   else
 384     bar(12);
 385 }
 386
 387 to enable further optimizations.
 388
 389 //===---------------------------------------------------------------------===//
 390
 391 Consider:
 392
 393 typedef unsigned U32;
 394 typedef unsigned long long U64;
 395 int test (U32 *inst, U64 *regs) {
 396     U64 effective_addr2;
 397     U32 temp = *inst;
 398     int r1 = (temp >> 20) & 0xf;
 399     int b2 = (temp >> 16) & 0xf;
 400     effective_addr2 = temp & 0xfff;
 401     if (b2) effective_addr2 += regs[b2];
 402     b2 = (temp >> 12) & 0xf;
 403     if (b2) effective_addr2 += regs[b2];
 404     effective_addr2 &= regs[4];
 405      if ((effective_addr2 & 3) == 0)
 406         return 1;
 407     return 0;
 408 }
 409
 410 Note that only the low 2 bits of effective_addr2 are used.  On 32-bit systems,
 411 we don't eliminate the computation of the top half of effective_addr2 because
 412 we don't have whole-function selection dags.  On x86, this means we use one
 413 extra register for the function when effective_addr2 is declared as U64 than
 414 when it is declared U32.
 415
 416 //===---------------------------------------------------------------------===//
 417
 418 Promote for i32 bswap can use i64 bswap + shr.  Useful on targets with 64-bit
 419 regs and bswap, like itanium.
 420
 421 //===---------------------------------------------------------------------===//