lib/Target/README.txt

   1 Target Independent Opportunities:
   2
   3 //===---------------------------------------------------------------------===//
   4
   5 We should make the following changes to clean up MachineInstr:
   6
   7 1. Add an Opcode field to TargetInstrDescriptor, so you can tell the opcode of
   8    an instruction with just a TargetInstrDescriptor*.
   9 2. Remove the Opcode field from MachineInstr, replacing it with a
  10    TargetInstrDescriptor*.
  11 3. Getting information about a machine instr then becomes:
  12      MI->getInfo()->isTwoAddress()
  13    instead of:
  14      const TargetInstrInfo &TII = ...
  15      TII.isTwoAddrInstr(MI->getOpcode())
  16
  17 //===---------------------------------------------------------------------===//
  18
  19 FreeBench/mason contains code like this:
  20
  21 static p_type m0u(p_type p) {
  22   int m[]={0, 8, 1, 2, 16, 5, 13, 7, 14, 9, 3, 4, 11, 12, 15, 10, 17, 6};
  23   p_type pu;
  24   pu.a = m[p.a];
  25   pu.b = m[p.b];
  26   pu.c = m[p.c];
  27   return pu;
  28 }
  29
  30 We currently compile this into a memcpy from a static array into 'm', then
  31 a bunch of loads from m.  It would be better to avoid the memcpy and just do
  32 loads from the static array.
  33
  34 //===---------------------------------------------------------------------===//
  35
  36 Make the PPC branch selector target independant
  37
  38 //===---------------------------------------------------------------------===//
  39
  40 Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
  41 precision don't matter (ffastmath).  Misc/mandel will like this. :)
  42
  43 //===---------------------------------------------------------------------===//
  44
  45 Solve this DAG isel folding deficiency:
  46
  47 int X, Y;
  48
  49 void fn1(void)
  50 {
  51   X = X | (Y << 3);
  52 }
  53
  54 compiles to
  55
  56 fn1:
  57         movl Y, %eax
  58         shll $3, %eax
  59         orl X, %eax
  60         movl %eax, X
  61         ret
  62
  63 The problem is the store's chain operand is not the load X but rather
  64 a TokenFactor of the load X and load Y, which prevents the folding.
  65
  66 There are two ways to fix this:
  67
  68 1. The dag combiner can start using alias analysis to realize that y/x
  69    don't alias, making the store to X not dependent on the load from Y.
  70 2. The generated isel could be made smarter in the case it can't
  71    disambiguate the pointers.
  72
  73 Number 1 is the preferred solution.
  74
  75 This has been "fixed" by a TableGen hack. But that is a short term workaround
  76 which will be removed once the proper fix is made.
  77
  78 //===---------------------------------------------------------------------===//
  79
  80 On targets with expensive 64-bit multiply, we could LSR this:
  81
  82 for (i = ...; ++i) {
  83    x = 1ULL << i;
  84
  85 into:
  86  long long tmp = 1;
  87  for (i = ...; ++i, tmp+=tmp)
  88    x = tmp;
  89
  90 This would be a win on ppc32, but not x86 or ppc64.
  91
  92 //===---------------------------------------------------------------------===//
  93
  94 Shrink: (setlt (loadi32 P), 0) -> (setlt (loadi8 Phi), 0)
  95
  96 //===---------------------------------------------------------------------===//
  97
  98 Reassociate should turn: X*X*X*X -> t=(X*X) (t*t) to eliminate a multiply.
  99
 100 //===---------------------------------------------------------------------===//
 101
 102 Interesting? testcase for add/shift/mul reassoc:
 103
 104 int bar(int x, int y) {
 105   return x*x*x+y+x*x*x*x*x*y*y*y*y;
 106 }
 107 int foo(int z, int n) {
 108   return bar(z, n) + bar(2*z, 2*n);
 109 }
 110
 111 //===---------------------------------------------------------------------===//
 112
 113 These two functions should generate the same code on big-endian systems:
 114
 115 int g(int *j,int *l)  {  return memcmp(j,l,4);  }
 116 int h(int *j, int *l) {  return *j - *l; }
 117
 118 this could be done in SelectionDAGISel.cpp, along with other special cases,
 119 for 1,2,4,8 bytes.
 120
 121 //===---------------------------------------------------------------------===//
 122
 123 This code:
 124 int rot(unsigned char b) { int a = ((b>>1) ^ (b<<7)) & 0xff; return a; }
 125
 126 Can be improved in two ways:
 127
 128 1. The instcombiner should eliminate the type conversions.
 129 2. The X86 backend should turn this into a rotate by one bit.
 130
 131 //===---------------------------------------------------------------------===//
 132
 133 Add LSR exit value substitution. It'll probably be a win for Ackermann, etc.
 134
 135 //===---------------------------------------------------------------------===//
 136
 137 It would be nice to revert this patch:
 138 http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
 139
 140 And teach the dag combiner enough to simplify the code expanded before
 141 legalize.  It seems plausible that this knowledge would let it simplify other
 142 stuff too.
 143
 144 //===---------------------------------------------------------------------===//
 145
 146 For packed types, TargetData.cpp::getTypeInfo() returns alignment that is equal
 147 to the type size. It works but can be overly conservative as the alignment of
 148 specific packed types are target dependent.
 149
 150 //===---------------------------------------------------------------------===//
 151
 152 We should add 'unaligned load/store' nodes, and produce them from code like
 153 this:
 154
 155 v4sf example(float *P) {
 156   return (v4sf){P[0], P[1], P[2], P[3] };
 157 }
 158
 159 //===---------------------------------------------------------------------===//
 160
 161 We should constant fold packed type casts at the LLVM level, regardless of the
 162 cast.  Currently we cannot fold some casts because we don't have TargetData
 163 information in the constant folder, so we don't know the endianness of the
 164 target!
 165
 166 //===---------------------------------------------------------------------===//
 167
 168 Add support for conditional increments, and other related patterns.  Instead
 169 of:
 170
 171         movl 136(%esp), %eax
 172         cmpl $0, %eax
 173         je LBB16_2      #cond_next
 174 LBB16_1:        #cond_true
 175         incl _foo
 176 LBB16_2:        #cond_next
 177
 178 emit:
 179         movl    _foo, %eax
 180         cmpl    $1, %edi
 181         sbbl    $-1, %eax
 182         movl    %eax, _foo
 183
 184 //===---------------------------------------------------------------------===//
 185
 186 Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
 187
 188 Expand these to calls of sin/cos and stores:
 189       double sincos(double x, double *sin, double *cos);
 190       float sincosf(float x, float *sin, float *cos);
 191       long double sincosl(long double x, long double *sin, long double *cos);
 192
 193 Doing so could allow SROA of the destination pointers.  See also:
 194 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17687
 195
 196 //===---------------------------------------------------------------------===//
 197
 198 Scalar Repl cannot currently promote this testcase to 'ret long cst':
 199
 200         %struct.X = type { int, int }
 201         %struct.Y = type { %struct.X }
 202 ulong %bar() {
 203         %retval = alloca %struct.Y, align 8             ; <%struct.Y*> [#uses=3]
 204         %tmp12 = getelementptr %struct.Y* %retval, int 0, uint 0, uint 0
 205         store int 0, int* %tmp12
 206         %tmp15 = getelementptr %struct.Y* %retval, int 0, uint 0, uint 1
 207         store int 1, int* %tmp15
 208         %retval = cast %struct.Y* %retval to ulong*
 209         %retval = load ulong* %retval           ; <ulong> [#uses=1]
 210         ret ulong %retval
 211 }
 212
 213 it should be extended to do so.
 214
 215 //===---------------------------------------------------------------------===//
 216
 217 Turn this into a single byte store with no load (the other 3 bytes are
 218 unmodified):
 219
 220 void %test(uint* %P) {
 221         %tmp = load uint* %P
 222         %tmp14 = or uint %tmp, 3305111552
 223         %tmp15 = and uint %tmp14, 3321888767
 224         store uint %tmp15, uint* %P
 225         ret void
 226 }
 227
 228 //===---------------------------------------------------------------------===//
 229
 230 dag/inst combine "clz(x)>>5 -> x==0" for 32-bit x.
 231
 232 Compile:
 233
 234 int bar(int x)
 235 {
 236   int t = __builtin_clz(x);
 237   return -(t>>5);
 238 }
 239
 240 to:
 241
 242 _bar:   addic r3,r3,-1
 243         subfe r3,r3,r3
 244         blr
 245
 246 //===---------------------------------------------------------------------===//
 247
 248 Legalize should lower ctlz like this:
 249   ctlz(x) = popcnt((x-1) & ~x)
 250
 251 on targets that have popcnt but not ctlz.  itanium, what else?
 252
 253 //===---------------------------------------------------------------------===//
 254
 255 quantum_sigma_x in 462.libquantum contains the following loop:
 256
 257       for(i=0; i<reg->size; i++)
 258         {
 259           /* Flip the target bit of each basis state */
 260           reg->node[i].state ^= ((MAX_UNSIGNED) 1 << target);
 261         }
 262
 263 Where MAX_UNSIGNED/state is a 64-bit int.  On a 32-bit platform it would be just
 264 so cool to turn it into something like:
 265
 266    long long Res = ((MAX_UNSIGNED) 1 << target);
 267    if (target < 32) {
 268      for(i=0; i<reg->size; i++)
 269        reg->node[i].state ^= Res & 0xFFFFFFFFULL;
 270    } else {
 271      for(i=0; i<reg->size; i++)
 272        reg->node[i].state ^= Res & 0xFFFFFFFF00000000ULL
 273    }
 274
 275 ... which would only do one 32-bit XOR per loop iteration instead of two.
 276
 277 It would also be nice to recognize the reg->size doesn't alias reg->node[i], but
 278 alas...
 279
 280 //===---------------------------------------------------------------------===//
 281
 282 This isn't recognized as bswap by instcombine:
 283
 284 unsigned int swap_32(unsigned int v) {
 285   v = ((v & 0x00ff00ffU) << 8)  | ((v & 0xff00ff00U) >> 8);
 286   v = ((v & 0x0000ffffU) << 16) | ((v & 0xffff0000U) >> 16);
 287   return v;
 288 }
 289
 290 //===---------------------------------------------------------------------===//
 291
 292 These should turn into single 16-bit (unaligned?) loads on little/big endian
 293 processors.
 294
 295 unsigned short read_16_le(const unsigned char *adr) {
 296   return adr[0] | (adr[1] << 8);
 297 }
 298 unsigned short read_16_be(const unsigned char *adr) {
 299   return (adr[0] << 8) | adr[1];
 300 }
 301
 302 //===---------------------------------------------------------------------===//