lib/Target/README.txt

   1 Target Independent Opportunities:
   2
   3 ===-------------------------------------------------------------------------===
   4
   5 FreeBench/mason contains code like this:
   6
   7 static p_type m0u(p_type p) {
   8   int m[]={0, 8, 1, 2, 16, 5, 13, 7, 14, 9, 3, 4, 11, 12, 15, 10, 17, 6};
   9   p_type pu;
  10   pu.a = m[p.a];
  11   pu.b = m[p.b];
  12   pu.c = m[p.c];
  13   return pu;
  14 }
  15
  16 We currently compile this into a memcpy from a static array into 'm', then
  17 a bunch of loads from m.  It would be better to avoid the memcpy and just do
  18 loads from the static array.
  19
  20 //===---------------------------------------------------------------------===//
  21
  22 Make the PPC branch selector target independent
  23
  24 //===---------------------------------------------------------------------===//
  25
  26 Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
  27 precision don't matter (-ffast-math).  Misc/mandel will like this. :)
  28
  29 //===---------------------------------------------------------------------===//
  30
  31 Solve this DAG isel folding deficiency:
  32
  33 int X, Y;
  34
  35 void fn1(void)
  36 {
  37   X = X | (Y << 3);
  38 }
  39
  40 compiles to
  41
  42 fn1:
  43         movl Y, %eax
  44         shll $3, %eax
  45         orl X, %eax
  46         movl %eax, X
  47         ret
  48
  49 The problem is the store's chain operand is not the load X but rather
  50 a TokenFactor of the load X and load Y, which prevents the folding.
  51
  52 There are two ways to fix this:
  53
  54 1. The dag combiner can start using alias analysis to realize that X/Y
  55    don't alias, making the store to X not dependent on the load from Y.
  56 2. The generated isel could be made smarter in the case it can't
  57    disambiguate the pointers.
  58
  59 Number 1 is the preferred solution.
  60
  61 This has been "fixed" by a TableGen hack, but that is a short-term workaround
  62 which will be removed once the proper fix is made.
  63
  64 //===---------------------------------------------------------------------===//
  65
  66 On targets with expensive 64-bit multiply, we could LSR this:
  67
  68 for (i = ...; ++i) {
  69    x = 1ULL << i;
  70
  71 into:
  72  long long tmp = 1;
  73  for (i = ...; ++i, tmp+=tmp)
  74    x = tmp;
  75
  76 This would be a win on ppc32, but not x86 or ppc64.
  77
  78 //===---------------------------------------------------------------------===//
  79
  80 Shrink: (setlt (loadi32 P), 0) -> (setlt (loadi8 Phi), 0)
  81
  82 //===---------------------------------------------------------------------===//
  83
  84 Reassociate should turn: X*X*X*X -> t=(X*X) (t*t) to eliminate a multiply.
  85
  86 //===---------------------------------------------------------------------===//
  87
  88 Interesting? testcase for add/shift/mul reassoc:
  89
  90 int bar(int x, int y) {
  91   return x*x*x+y+x*x*x*x*x*y*y*y*y;
  92 }
  93 int foo(int z, int n) {
  94   return bar(z, n) + bar(2*z, 2*n);
  95 }
  96
  97 //===---------------------------------------------------------------------===//
  98
  99 These two functions should generate the same code on big-endian systems:
 100
 101 int g(int *j,int *l)  {  return memcmp(j,l,4);  }
 102 int h(int *j, int *l) {  return *j - *l; }
 103
 104 This could be done in SelectionDAGISel.cpp, along with other special cases,
 105 for 1,2,4,8 bytes.
 106
 107 //===---------------------------------------------------------------------===//
 108
 109 This code:
 110 int rot(unsigned char b) { int a = ((b>>1) ^ (b<<7)) & 0xff; return a; }
 111
 112 Can be improved in two ways:
 113
 114 1. The instcombiner should eliminate the type conversions.
 115 2. The X86 backend should turn this into a rotate by one bit.
 116
 117 //===---------------------------------------------------------------------===//
 118
 119 Add LSR exit value substitution. It'll probably be a win for Ackermann, etc.
 120
 121 //===---------------------------------------------------------------------===//
 122
 123 It would be nice to revert this patch:
 124 http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
 125
 126 And teach the dag combiner enough to simplify the code expanded before
 127 legalize.  It seems plausible that this knowledge would let it simplify other
 128 stuff too.
 129
 130 //===---------------------------------------------------------------------===//
 131
 132 For packed types, TargetData.cpp::getTypeInfo() returns alignment that is equal
 133 to the type size. It works but can be overly conservative as the alignment of
 134 specific packed types is target dependent.
 135
 136 //===---------------------------------------------------------------------===//
 137
 138 We should add 'unaligned load/store' nodes, and produce them from code like
 139 this:
 140
 141 v4sf example(float *P) {
 142   return (v4sf){P[0], P[1], P[2], P[3] };
 143 }
 144
 145 //===---------------------------------------------------------------------===//
 146
 147 We should constant fold packed type casts at the LLVM level, regardless of the
 148 cast.  Currently we cannot fold some casts because we don't have TargetData
 149 information in the constant folder, so we don't know the endianness of the
 150 target!
 151
 152 //===---------------------------------------------------------------------===//
 153
 154 Add support for conditional increments, and other related patterns.  Instead
 155 of:
 156
 157         movl 136(%esp), %eax
 158         cmpl $0, %eax
 159         je LBB16_2      #cond_next
 160 LBB16_1:        #cond_true
 161         incl _foo
 162 LBB16_2:        #cond_next
 163
 164 emit:
 165         movl    _foo, %eax
 166         cmpl    $1, %edi
 167         sbbl    $-1, %eax
 168         movl    %eax, _foo
 169
 170 //===---------------------------------------------------------------------===//
 171
 172 Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
 173
 174 Expand these to calls of sin/cos and stores:
 175       double sincos(double x, double *sin, double *cos);
 176       float sincosf(float x, float *sin, float *cos);
 177       long double sincosl(long double x, long double *sin, long double *cos);
 178
 179 Doing so could allow SROA of the destination pointers.  See also:
 180 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17687
 181
 182 //===---------------------------------------------------------------------===//
 183
 184 Scalar Repl cannot currently promote this testcase to 'ret long cst':
 185
 186         %struct.X = type { int, int }
 187         %struct.Y = type { %struct.X }
 188 ulong %bar() {
 189         %retval = alloca %struct.Y, align 8             ; <%struct.Y*> [#uses=3]
 190         %tmp12 = getelementptr %struct.Y* %retval, int 0, uint 0, uint 0
 191         store int 0, int* %tmp12
 192         %tmp15 = getelementptr %struct.Y* %retval, int 0, uint 0, uint 1
 193         store int 1, int* %tmp15
 194         %retval = cast %struct.Y* %retval to ulong*
 195         %retval = load ulong* %retval           ; <ulong> [#uses=1]
 196         ret ulong %retval
 197 }
 198
 199 Scalar Repl should be extended to do so.
 200
 201 //===---------------------------------------------------------------------===//
 202
 203 Turn this into a single byte store with no load (the other 3 bytes are
 204 unmodified):
 205
 206 void %test(uint* %P) {
 207         %tmp = load uint* %P
 208         %tmp14 = or uint %tmp, 3305111552
 209         %tmp15 = and uint %tmp14, 3321888767
 210         store uint %tmp15, uint* %P
 211         ret void
 212 }
 213
 214 //===---------------------------------------------------------------------===//
 215
 216 dag/inst combine "clz(x)>>5 -> x==0" for 32-bit x.
 217
 218 Compile:
 219
 220 int bar(int x)
 221 {
 222   int t = __builtin_clz(x);
 223   return -(t>>5);
 224 }
 225
 226 to:
 227
 228 _bar:   addic r3,r3,-1
 229         subfe r3,r3,r3
 230         blr
 231
 232 //===---------------------------------------------------------------------===//
 233
 234 Legalize should lower cttz like this:
 235   cttz(x) = popcnt((x-1) & ~x)
 236
 237 on targets that have popcnt but not cttz.  Itanium, what else?
 238
 239 //===---------------------------------------------------------------------===//
 240
 241 quantum_sigma_x in 462.libquantum contains the following loop:
 242
 243       for(i=0; i<reg->size; i++)
 244         {
 245           /* Flip the target bit of each basis state */
 246           reg->node[i].state ^= ((MAX_UNSIGNED) 1 << target);
 247         }
 248
 249 Where MAX_UNSIGNED/state is a 64-bit int.  On a 32-bit platform it would be just
 250 so cool to turn it into something like:
 251
 252    long long Res = ((MAX_UNSIGNED) 1 << target);
 253    if (target < 32) {
 254      for(i=0; i<reg->size; i++)
 255        reg->node[i].state ^= Res & 0xFFFFFFFFULL;
 256    } else {
 257      for(i=0; i<reg->size; i++)
 258        reg->node[i].state ^= Res & 0xFFFFFFFF00000000ULL
 259    }
 260
 261 ... which would only do one 32-bit XOR per loop iteration instead of two.
 262
 263 It would also be nice to recognize that reg->size doesn't alias reg->node[i],
 264 but alas...
 265
 266 //===---------------------------------------------------------------------===//
 267
 268 This isn't recognized as bswap by instcombine:
 269
 270 unsigned int swap_32(unsigned int v) {
 271   v = ((v & 0x00ff00ffU) << 8)  | ((v & 0xff00ff00U) >> 8);
 272   v = ((v & 0x0000ffffU) << 16) | ((v & 0xffff0000U) >> 16);
 273   return v;
 274 }
 275
 276 //===---------------------------------------------------------------------===//
 277
 278 These should turn into single 16-bit (unaligned?) loads on little/big endian
 279 processors.
 280
 281 unsigned short read_16_le(const unsigned char *adr) {
 282   return adr[0] | (adr[1] << 8);
 283 }
 284 unsigned short read_16_be(const unsigned char *adr) {
 285   return (adr[0] << 8) | adr[1];
 286 }
 287
 288 //===---------------------------------------------------------------------===//