lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Add MUL2U and MUL2S nodes to represent a multiply that returns both the
   6 Hi and Lo parts (combination of MUL and MULH[SU] into one node).  Add this to
   7 X86, & make the dag combiner produce it when needed.  This will eliminate one
   8 imul from the code generated for:
   9
  10 long long test(long long X, long long Y) { return X*Y; }
  11
  12 by using the EAX result from the mul.  We should add a similar node for
  13 DIVREM.
  14
  15 Another case is:
  16
  17 long long test(int X, int Y) { return (long long)X*Y; }
  18
  19 ... which should only be one imul instruction.
  20
  21 //===---------------------------------------------------------------------===//
  22
  23 This should be one DIV/IDIV instruction, not a libcall:
  24
  25 unsigned test(unsigned long long X, unsigned Y) {
  26         return X/Y;
  27 }
  28
  29 This can be done trivially with a custom legalizer.  What about overflow
  30 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  31
  32 //===---------------------------------------------------------------------===//
  33
  34 Some targets (e.g. athlons) prefer freep to fstp ST(0):
  35 http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html
  36
  37 //===---------------------------------------------------------------------===//
  38
  39 This should use fiadd on chips where it is profitable:
  40 double foo(double P, int *I) { return P+*I; }
  41
  42 //===---------------------------------------------------------------------===//
  43
  44 The FP stackifier needs to be global.  Also, it should handle simple permutations
  45 to reduce the number of shuffle instructions, e.g. turning:
  46
  47 fld P   ->              fld Q
  48 fld Q                   fld P
  49 fxch
  50
  51 or:
  52
  53 fxch    ->              fucomi
  54 fucomi                  jl X
  55 jg X
  56
  57 Ideas:
  58 http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html
  59
  60
  61 //===---------------------------------------------------------------------===//
  62
  63 Improvements to the multiply -> shift/add algorithm:
  64 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  65
  66 //===---------------------------------------------------------------------===//
  67
  68 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  69 long long foo(int x) { return 1LL << x; }
  70
  71 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  72 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  73 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  74
  75 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  76
  77 //===---------------------------------------------------------------------===//
  78
  79 Should support emission of the bswap instruction, probably by adding a new
  80 DAG node for byte swapping.  Also useful on PPC which has byte-swapping loads.
  81
  82 //===---------------------------------------------------------------------===//
  83
  84 Compile this:
  85 _Bool f(_Bool a) { return a!=1; }
  86
  87 into:
  88         movzbl  %dil, %eax
  89         xorl    $1, %eax
  90         ret
  91
  92 //===---------------------------------------------------------------------===//
  93
  94 Some isel ideas:
  95
  96 1. Dynamic programming based approach when compile time is not an
  97    issue.
  98 2. Code duplication (addressing mode) during isel.
  99 3. Other ideas from "Register-Sensitive Selection, Duplication, and
 100    Sequencing of Instructions".
 101
 102 //===---------------------------------------------------------------------===//
 103
 104 Should we promote i16 to i32 to avoid partial register update stalls?
 105
 106 //===---------------------------------------------------------------------===//
 107
 108 Leave any_extend as a pseudo instruction and hint to the register
 109 allocator. Delay codegen until post register allocation.
 110
 111 //===---------------------------------------------------------------------===//
 112
 113 Add a target specific hook to DAG combiner to handle SINT_TO_FP and
 114 FP_TO_SINT when the source operand is already in memory.
 115
 116 //===---------------------------------------------------------------------===//
 117
 118 Check if load folding would add a cycle in the dag.
 119
 120 //===---------------------------------------------------------------------===//
 121
 122 Model X86 EFLAGS as a real register to avoid redundant cmp / test. e.g.
 123
 124         cmpl $1, %eax
 125         setg %al
 126         testb %al, %al  # unnecessary
 127         jne .BB7
 128
 129 //===---------------------------------------------------------------------===//
 130
 131 Count leading zeros and count trailing zeros:
 132
 133 int clz(int X) { return __builtin_clz(X); }
 134 int ctz(int X) { return __builtin_ctz(X); }
 135
 136 $ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
 137 clz:
 138         bsr     %eax, DWORD PTR [%esp+4]
 139         xor     %eax, 31
 140         ret
 141 ctz:
 142         bsf     %eax, DWORD PTR [%esp+4]
 143         ret
 144
 145 However, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
 146 aren't.
 147
 148 //===---------------------------------------------------------------------===//
 149
 150 Use push/pop instructions in prolog/epilog sequences instead of stores off
 151 ESP (certain code size win, perf win on some [which?] processors).
 152
 153 //===---------------------------------------------------------------------===//
 154
 155 Only use inc/neg/not instructions on processors where they are faster than
 156 add/sub/xor.  They are slower on the P4 due to only updating some processor
 157 flags.
 158
 159 //===---------------------------------------------------------------------===//
 160
 161 Open code rint,floor,ceil,trunc:
 162 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
 163 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html
 164
 165 //===---------------------------------------------------------------------===//
 166
 167 Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
 168