lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Add a MUL2U and MUL2S nodes to represent a multiply that returns both the
   6 Hi and Lo parts (combination of MUL and MULH[SU] into one node).  Add this to
   7 X86, & make the dag combiner produce it when needed.  This will eliminate one
   8 imul from the code generated for:
   9
  10 long long test(long long X, long long Y) { return X*Y; }
  11
  12 by using the EAX result from the mul.  We should add a similar node for
  13 DIVREM.
  14
  15 another case is:
  16
  17 long long test(int X, int Y) { return (long long)X*Y; }
  18
  19 ... which should only be one imul instruction.
  20
  21 //===---------------------------------------------------------------------===//
  22
  23 This should be one DIV/IDIV instruction, not a libcall:
  24
  25 unsigned test(unsigned long long X, unsigned Y) {
  26         return X/Y;
  27 }
  28
  29 This can be done trivially with a custom legalizer.  What about overflow
  30 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  31
  32 //===---------------------------------------------------------------------===//
  33
  34 Some targets (e.g. athlons) prefer freep to fstp ST(0):
  35 http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html
  36
  37 //===---------------------------------------------------------------------===//
  38
  39 This should use fiadd on chips where it is profitable:
  40 double foo(double P, int *I) { return P+*I; }
  41
  42 We have fiadd patterns now but the followings have the same cost and
  43 complexity. We need a way to specify the later is more profitable.
  44
  45 def FpADD32m  : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW,
  46                     [(set RFP:$dst, (fadd RFP:$src1,
  47                                      (extloadf64f32 addr:$src2)))]>;
  48                 // ST(0) = ST(0) + [mem32]
  49
  50 def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW,
  51                     [(set RFP:$dst, (fadd RFP:$src1,
  52                                      (X86fild addr:$src2, i32)))]>;
  53                 // ST(0) = ST(0) + [mem32int]
  54
  55 //===---------------------------------------------------------------------===//
  56
  57 The FP stackifier needs to be global.  Also, it should handle simple permutates
  58 to reduce number of shuffle instructions, e.g. turning:
  59
  60 fld P   ->              fld Q
  61 fld Q                   fld P
  62 fxch
  63
  64 or:
  65
  66 fxch    ->              fucomi
  67 fucomi                  jl X
  68 jg X
  69
  70 Ideas:
  71 http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html
  72
  73
  74 //===---------------------------------------------------------------------===//
  75
  76 Improvements to the multiply -> shift/add algorithm:
  77 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  78
  79 //===---------------------------------------------------------------------===//
  80
  81 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  82 long long foo(int x) { return 1LL << x; }
  83
  84 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  85 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  86 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  87
  88 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  89
  90 //===---------------------------------------------------------------------===//
  91
  92 Compile this:
  93 _Bool f(_Bool a) { return a!=1; }
  94
  95 into:
  96         movzbl  %dil, %eax
  97         xorl    $1, %eax
  98         ret
  99
 100 //===---------------------------------------------------------------------===//
 101
 102 Some isel ideas:
 103
 104 1. Dynamic programming based approach when compile time if not an
 105    issue.
 106 2. Code duplication (addressing mode) during isel.
 107 3. Other ideas from "Register-Sensitive Selection, Duplication, and
 108    Sequencing of Instructions".
 109 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
 110    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
 111    and other related papers.
 112    http://citeseer.ist.psu.edu/govindarajan01minimum.html
 113
 114 //===---------------------------------------------------------------------===//
 115
 116 Should we promote i16 to i32 to avoid partial register update stalls?
 117
 118 //===---------------------------------------------------------------------===//
 119
 120 Leave any_extend as pseudo instruction and hint to register
 121 allocator. Delay codegen until post register allocation.
 122
 123 //===---------------------------------------------------------------------===//
 124
 125 Add a target specific hook to DAG combiner to handle SINT_TO_FP and
 126 FP_TO_SINT when the source operand is already in memory.
 127
 128 //===---------------------------------------------------------------------===//
 129
 130 Model X86 EFLAGS as a real register to avoid redudant cmp / test. e.g.
 131
 132         cmpl $1, %eax
 133         setg %al
 134         testb %al, %al  # unnecessary
 135         jne .BB7
 136
 137 //===---------------------------------------------------------------------===//
 138
 139 Count leading zeros and count trailing zeros:
 140
 141 int clz(int X) { return __builtin_clz(X); }
 142 int ctz(int X) { return __builtin_ctz(X); }
 143
 144 $ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
 145 clz:
 146         bsr     %eax, DWORD PTR [%esp+4]
 147         xor     %eax, 31
 148         ret
 149 ctz:
 150         bsf     %eax, DWORD PTR [%esp+4]
 151         ret
 152
 153 however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
 154 aren't.
 155
 156 //===---------------------------------------------------------------------===//
 157
 158 Use push/pop instructions in prolog/epilog sequences instead of stores off
 159 ESP (certain code size win, perf win on some [which?] processors).
 160
 161 //===---------------------------------------------------------------------===//
 162
 163 Only use inc/neg/not instructions on processors where they are faster than
 164 add/sub/xor.  They are slower on the P4 due to only updating some processor
 165 flags.
 166
 167 //===---------------------------------------------------------------------===//
 168
 169 Open code rint,floor,ceil,trunc:
 170 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
 171 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html
 172
 173 //===---------------------------------------------------------------------===//
 174
 175 Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
 176
 177 Expand these to calls of sin/cos and stores:
 178       double sincos(double x, double *sin, double *cos);
 179       float sincosf(float x, float *sin, float *cos);
 180       long double sincosl(long double x, long double *sin, long double *cos);
 181
 182 Doing so could allow SROA of the destination pointers.  See also:
 183 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17687
 184
 185 //===---------------------------------------------------------------------===//
 186
 187 The instruction selector sometimes misses folding a load into a compare.  The
 188 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 189 commutative, it is not matched with the load on both sides.  The dag combiner
 190 should be made smart enough to cannonicalize the load into the RHS of a compare
 191 when it can invert the result of the compare for free.
 192
 193 //===---------------------------------------------------------------------===//
 194
 195 LSR should be turned on for the X86 backend and tuned to take advantage of its
 196 addressing modes.
 197
 198 //===---------------------------------------------------------------------===//
 199
 200 When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
 201 other fast SSE modes.
 202
 203 //===---------------------------------------------------------------------===//
 204
 205 Think about doing i64 math in SSE regs.
 206
 207 //===---------------------------------------------------------------------===//
 208
 209 The DAG Isel doesn't fold the loads into the adds in this testcase.  The
 210 pattern selector does.  This is because the chain value of the load gets
 211 selected first, and the loads aren't checking to see if they are only used by
 212 and add.
 213
 214 .ll:
 215
 216 int %test(int* %x, int* %y, int* %z) {
 217         %X = load int* %x
 218         %Y = load int* %y
 219         %Z = load int* %z
 220         %a = add int %X, %Y
 221         %b = add int %a, %Z
 222         ret int %b
 223 }
 224
 225 dag isel:
 226
 227 _test:
 228         movl 4(%esp), %eax
 229         movl (%eax), %eax
 230         movl 8(%esp), %ecx
 231         movl (%ecx), %ecx
 232         addl %ecx, %eax
 233         movl 12(%esp), %ecx
 234         movl (%ecx), %ecx
 235         addl %ecx, %eax
 236         ret
 237
 238 pattern isel:
 239
 240 _test:
 241         movl 12(%esp), %ecx
 242         movl 4(%esp), %edx
 243         movl 8(%esp), %eax
 244         movl (%eax), %eax
 245         addl (%edx), %eax
 246         addl (%ecx), %eax
 247         ret
 248
 249 This is bad for register pressure, though the dag isel is producing a
 250 better schedule. :)
 251
 252 //===---------------------------------------------------------------------===//
 253
 254 This testcase should have no SSE instructions in it, and only one load from
 255 a constant pool:
 256
 257 double %test3(bool %B) {
 258         %C = select bool %B, double 123.412, double 523.01123123
 259         ret double %C
 260 }
 261
 262 Currently, the select is being lowered, which prevents the dag combiner from
 263 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
 264
 265 The pattern isel got this one right.
 266
 267 //===---------------------------------------------------------------------===//
 268
 269 We need to lower switch statements to tablejumps when appropriate instead of
 270 always into binary branch trees.
 271
 272 //===---------------------------------------------------------------------===//
 273
 274 SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
 275 like this:
 276
 277   X += y
 278
 279 and the register allocator decides to spill X, it is cheaper to emit this as:
 280
 281 Y += [xslot]
 282 store Y -> [xslot]
 283
 284 than as:
 285
 286 tmp = [xslot]
 287 tmp += y
 288 store tmp -> [xslot]
 289
 290 ..and this uses one fewer register (so this should be done at load folding
 291 time, not at spiller time).  *Note* however that this can only be done
 292 if Y is dead.  Here's a testcase:
 293
 294 %.str_3 = external global [15 x sbyte]          ; <[15 x sbyte]*> [#uses=0]
 295 implementation   ; Functions:
 296 declare void %printf(int, ...)
 297 void %main() {
 298 build_tree.exit:
 299         br label %no_exit.i7
 300 no_exit.i7:             ; preds = %no_exit.i7, %build_tree.exit
 301         %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ]      ; <double> [#uses=1]
 302         %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ]     ; <double> [#uses=1]
 303         %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
 304         %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
 305         br bool false, label %Compute_Tree.exit23, label %no_exit.i7
 306 Compute_Tree.exit23:            ; preds = %no_exit.i7
 307         tail call void (int, ...)* %printf( int 0 )
 308         store double %tmp.34.i18, double* null
 309         ret void
 310 }
 311
 312 We currently emit:
 313
 314 .BBmain_1:
 315         xorpd %XMM1, %XMM1
 316         addsd %XMM0, %XMM1
 317 ***     movsd %XMM2, QWORD PTR [%ESP + 8]
 318 ***     addsd %XMM2, %XMM1
 319 ***     movsd QWORD PTR [%ESP + 8], %XMM2
 320         jmp .BBmain_1   # no_exit.i7
 321
 322 This is a bugpoint reduced testcase, which is why the testcase doesn't make
 323 much sense (e.g. its an infinite loop). :)
 324
 325 //===---------------------------------------------------------------------===//
 326
 327 None of the FPStack instructions are handled in
 328 X86RegisterInfo::foldMemoryOperand, which prevents the spiller from
 329 folding spill code into the instructions.
 330
 331 //===---------------------------------------------------------------------===//
 332
 333 In many cases, LLVM generates code like this:
 334
 335 _test:
 336         movl 8(%esp), %eax
 337         cmpl %eax, 4(%esp)
 338         setl %al
 339         movzbl %al, %eax
 340         ret
 341
 342 on some processors (which ones?), it is more efficient to do this:
 343
 344 _test:
 345         movl 8(%esp), %ebx
 346         xor %eax, %eax
 347         cmpl %ebx, 4(%esp)
 348         setl %al
 349         ret
 350
 351 Doing this correctly is tricky though, as the xor clobbers the flags.
 352
 353 //===---------------------------------------------------------------------===//
 354
 355 We should generate 'test' instead of 'cmp' in various cases, e.g.:
 356
 357 bool %test(int %X) {
 358         %Y = shl int %X, ubyte 1
 359         %C = seteq int %Y, 0
 360         ret bool %C
 361 }
 362 bool %test(int %X) {
 363         %Y = and int %X, 8
 364         %C = seteq int %Y, 0
 365         ret bool %C
 366 }
 367
 368 This may just be a matter of using 'test' to write bigger patterns for X86cmp.
 369
 370 //===---------------------------------------------------------------------===//
 371
 372 SSE should implement 'select_cc' using 'emulated conditional moves' that use
 373 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
 374
 375 double %X(double %Y, double %Z, double %A, double %B) {
 376         %C = setlt double %A, %B
 377         %z = add double %Z, 0.0    ;; select operand is not a load
 378         %D = select bool %C, double %Y, double %z
 379         ret double %D
 380 }
 381
 382 We currently emit:
 383
 384 _X:
 385         subl $12, %esp
 386         xorpd %xmm0, %xmm0
 387         addsd 24(%esp), %xmm0
 388         movsd 32(%esp), %xmm1
 389         movsd 16(%esp), %xmm2
 390         ucomisd 40(%esp), %xmm1
 391         jb LBB_X_2
 392 LBB_X_1:
 393         movsd %xmm0, %xmm2
 394 LBB_X_2:
 395         movsd %xmm2, (%esp)
 396         fldl (%esp)
 397         addl $12, %esp
 398         ret
 399
 400 //===---------------------------------------------------------------------===//
 401
 402 We should generate bts/btr/etc instructions on targets where they are cheap or
 403 when codesize is important.  e.g., for:
 404
 405 void setbit(int *target, int bit) {
 406     *target |= (1 << bit);
 407 }
 408 void clearbit(int *target, int bit) {
 409     *target &= ~(1 << bit);
 410 }
 411
 412 //===---------------------------------------------------------------------===//
 413
 414 Easy: Global addresses are not always allowed as immediates.  For this:
 415
 416 int dst = 0; int *ptr = 0;
 417 void foo() { ptr = &dst; }
 418
 419 we get this:
 420
 421 _foo:
 422         movl $_dst, %eax
 423         movl %eax, _ptr
 424         ret
 425
 426 When: "movl $_dst, _ptr" is sufficient.
 427
 428 //===---------------------------------------------------------------------===//
 429
 430 Instead of the following for memset char*, 1, 10:
 431
 432         movl $16843009, 4(%edx)
 433         movl $16843009, (%edx)
 434         movw $257, 8(%edx)
 435
 436 It might be better to generate
 437
 438         movl $16843009, %eax
 439         movl %eax, 4(%edx)
 440         movl %eax, (%edx)
 441         movw al, 8(%edx)
 442
 443 when we can spare a register. It reduces code size.
 444
 445 //===---------------------------------------------------------------------===//
 446
 447 It's not clear whether we should use pxor or xorps / xorpd to clear XMM
 448 registers. The choice may depend on subtarget information. We should do some
 449 more experiments on different x86 machines.
 450
 451 //===---------------------------------------------------------------------===//
 452
 453 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 454 get this:
 455
 456 int %test1(int %X) {
 457         %Y = div int %X, 8
 458         ret int %Y
 459 }
 460
 461 _test1:
 462         movl 4(%esp), %eax
 463         movl %eax, %ecx
 464         sarl $31, %ecx
 465         shrl $29, %ecx
 466         addl %ecx, %eax
 467         sarl $3, %eax
 468         ret
 469
 470 GCC knows several different ways to codegen it, one of which is this:
 471
 472 _test1:
 473         movl    4(%esp), %eax
 474         cmpl    $-1, %eax
 475         leal    7(%eax), %ecx
 476         cmovle  %ecx, %eax
 477         sarl    $3, %eax
 478         ret
 479
 480 which is probably slower, but it's interesting at least :)
 481
 482 //===---------------------------------------------------------------------===//
 483
 484 Currently the x86 codegen isn't very good at mixing SSE and FPStack
 485 code:
 486
 487 unsigned int foo(double x) { return x; }
 488
 489 foo:
 490         subl $20, %esp
 491         movsd 24(%esp), %xmm0
 492         movsd %xmm0, 8(%esp)
 493         fldl 8(%esp)
 494         fisttpll (%esp)
 495         movl (%esp), %eax
 496         addl $20, %esp
 497         ret
 498
 499 This will be solved when we go to a dynamic programming based isel.