lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Add a MUL2U and MUL2S nodes to represent a multiply that returns both the
   6 Hi and Lo parts (combination of MUL and MULH[SU] into one node).  Add this to
   7 X86, & make the dag combiner produce it when needed.  This will eliminate one
   8 imul from the code generated for:
   9
  10 long long test(long long X, long long Y) { return X*Y; }
  11
  12 by using the EAX result from the mul.  We should add a similar node for
  13 DIVREM.
  14
  15 another case is:
  16
  17 long long test(int X, int Y) { return (long long)X*Y; }
  18
  19 ... which should only be one imul instruction.
  20
  21 This can be done with a custom expander, but it would be nice to move this to
  22 generic code.
  23
  24 //===---------------------------------------------------------------------===//
  25
  26 This should be one DIV/IDIV instruction, not a libcall:
  27
  28 unsigned test(unsigned long long X, unsigned Y) {
  29         return X/Y;
  30 }
  31
  32 This can be done trivially with a custom legalizer.  What about overflow
  33 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  34
  35 //===---------------------------------------------------------------------===//
  36
  37 Improvements to the multiply -> shift/add algorithm:
  38 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  39
  40 //===---------------------------------------------------------------------===//
  41
  42 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  43 long long foo(int x) { return 1LL << x; }
  44
  45 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  46 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  47 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  48
  49 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  50
  51 One better solution for 1LL << x is:
  52         xorl    %eax, %eax
  53         xorl    %edx, %edx
  54         testb   $32, %cl
  55         sete    %al
  56         setne   %dl
  57         sall    %cl, %eax
  58         sall    %cl, %edx
  59
  60 But that requires good 8-bit subreg support.
  61
  62 64-bit shifts (in general) expand to really bad code.  Instead of using
  63 cmovs, we should expand to a conditional branch like GCC produces.
  64
  65 //===---------------------------------------------------------------------===//
  66
  67 Compile this:
  68 _Bool f(_Bool a) { return a!=1; }
  69
  70 into:
  71         movzbl  %dil, %eax
  72         xorl    $1, %eax
  73         ret
  74
  75 //===---------------------------------------------------------------------===//
  76
  77 Some isel ideas:
  78
  79 1. Dynamic programming based approach when compile time if not an
  80    issue.
  81 2. Code duplication (addressing mode) during isel.
  82 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  83    Sequencing of Instructions".
  84 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  85    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  86    and other related papers.
  87    http://citeseer.ist.psu.edu/govindarajan01minimum.html
  88
  89 //===---------------------------------------------------------------------===//
  90
  91 Should we promote i16 to i32 to avoid partial register update stalls?
  92
  93 //===---------------------------------------------------------------------===//
  94
  95 Leave any_extend as pseudo instruction and hint to register
  96 allocator. Delay codegen until post register allocation.
  97
  98 //===---------------------------------------------------------------------===//
  99
 100 Count leading zeros and count trailing zeros:
 101
 102 int clz(int X) { return __builtin_clz(X); }
 103 int ctz(int X) { return __builtin_ctz(X); }
 104
 105 $ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
 106 clz:
 107         bsr     %eax, DWORD PTR [%esp+4]
 108         xor     %eax, 31
 109         ret
 110 ctz:
 111         bsf     %eax, DWORD PTR [%esp+4]
 112         ret
 113
 114 however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
 115 aren't.
 116
 117 Another example (use predsimplify to eliminate a select):
 118
 119 int foo (unsigned long j) {
 120   if (j)
 121     return __builtin_ffs (j) - 1;
 122   else
 123     return 0;
 124 }
 125
 126 //===---------------------------------------------------------------------===//
 127
 128 Use push/pop instructions in prolog/epilog sequences instead of stores off
 129 ESP (certain code size win, perf win on some [which?] processors).
 130 Also, it appears icc use push for parameter passing. Need to investigate.
 131
 132 //===---------------------------------------------------------------------===//
 133
 134 Only use inc/neg/not instructions on processors where they are faster than
 135 add/sub/xor.  They are slower on the P4 due to only updating some processor
 136 flags.
 137
 138 //===---------------------------------------------------------------------===//
 139
 140 The instruction selector sometimes misses folding a load into a compare.  The
 141 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 142 commutative, it is not matched with the load on both sides.  The dag combiner
 143 should be made smart enough to cannonicalize the load into the RHS of a compare
 144 when it can invert the result of the compare for free.
 145
 146 //===---------------------------------------------------------------------===//
 147
 148 How about intrinsics? An example is:
 149   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 150
 151 compiles to
 152         pmuludq (%eax), %xmm0
 153         movl 8(%esp), %eax
 154         movdqa (%eax), %xmm1
 155         pmulhuw %xmm0, %xmm1
 156
 157 The transformation probably requires a X86 specific pass or a DAG combiner
 158 target specific hook.
 159
 160 //===---------------------------------------------------------------------===//
 161
 162 In many cases, LLVM generates code like this:
 163
 164 _test:
 165         movl 8(%esp), %eax
 166         cmpl %eax, 4(%esp)
 167         setl %al
 168         movzbl %al, %eax
 169         ret
 170
 171 on some processors (which ones?), it is more efficient to do this:
 172
 173 _test:
 174         movl 8(%esp), %ebx
 175         xor  %eax, %eax
 176         cmpl %ebx, 4(%esp)
 177         setl %al
 178         ret
 179
 180 Doing this correctly is tricky though, as the xor clobbers the flags.
 181
 182 //===---------------------------------------------------------------------===//
 183
 184 We should generate bts/btr/etc instructions on targets where they are cheap or
 185 when codesize is important.  e.g., for:
 186
 187 void setbit(int *target, int bit) {
 188     *target |= (1 << bit);
 189 }
 190 void clearbit(int *target, int bit) {
 191     *target &= ~(1 << bit);
 192 }
 193
 194 //===---------------------------------------------------------------------===//
 195
 196 Instead of the following for memset char*, 1, 10:
 197
 198         movl $16843009, 4(%edx)
 199         movl $16843009, (%edx)
 200         movw $257, 8(%edx)
 201
 202 It might be better to generate
 203
 204         movl $16843009, %eax
 205         movl %eax, 4(%edx)
 206         movl %eax, (%edx)
 207         movw al, 8(%edx)
 208
 209 when we can spare a register. It reduces code size.
 210
 211 //===---------------------------------------------------------------------===//
 212
 213 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 214 get this:
 215
 216 int %test1(int %X) {
 217         %Y = div int %X, 8
 218         ret int %Y
 219 }
 220
 221 _test1:
 222         movl 4(%esp), %eax
 223         movl %eax, %ecx
 224         sarl $31, %ecx
 225         shrl $29, %ecx
 226         addl %ecx, %eax
 227         sarl $3, %eax
 228         ret
 229
 230 GCC knows several different ways to codegen it, one of which is this:
 231
 232 _test1:
 233         movl    4(%esp), %eax
 234         cmpl    $-1, %eax
 235         leal    7(%eax), %ecx
 236         cmovle  %ecx, %eax
 237         sarl    $3, %eax
 238         ret
 239
 240 which is probably slower, but it's interesting at least :)
 241
 242 //===---------------------------------------------------------------------===//
 243
 244 The first BB of this code:
 245
 246 declare bool %foo()
 247 int %bar() {
 248         %V = call bool %foo()
 249         br bool %V, label %T, label %F
 250 T:
 251         ret int 1
 252 F:
 253         call bool %foo()
 254         ret int 12
 255 }
 256
 257 compiles to:
 258
 259 _bar:
 260         subl $12, %esp
 261         call L_foo$stub
 262         xorb $1, %al
 263         testb %al, %al
 264         jne LBB_bar_2   # F
 265
 266 It would be better to emit "cmp %al, 1" than a xor and test.
 267
 268 //===---------------------------------------------------------------------===//
 269
 270 Enable X86InstrInfo::convertToThreeAddress().
 271
 272 //===---------------------------------------------------------------------===//
 273
 274 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 275 We should leave these as libcalls for everything over a much lower threshold,
 276 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 277 stores, TLB preheating, etc)
 278
 279 //===---------------------------------------------------------------------===//
 280
 281 Optimize this into something reasonable:
 282  x * copysign(1.0, y) * copysign(1.0, z)
 283
 284 //===---------------------------------------------------------------------===//
 285
 286 Optimize copysign(x, *y) to use an integer load from y.
 287
 288 //===---------------------------------------------------------------------===//
 289
 290 %X = weak global int 0
 291
 292 void %foo(int %N) {
 293         %N = cast int %N to uint
 294         %tmp.24 = setgt int %N, 0
 295         br bool %tmp.24, label %no_exit, label %return
 296
 297 no_exit:
 298         %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
 299         %i.0.0 = cast uint %indvar to int
 300         volatile store int %i.0.0, int* %X
 301         %indvar.next = add uint %indvar, 1
 302         %exitcond = seteq uint %indvar.next, %N
 303         br bool %exitcond, label %return, label %no_exit
 304
 305 return:
 306         ret void
 307 }
 308
 309 compiles into:
 310
 311         .text
 312         .align  4
 313         .globl  _foo
 314 _foo:
 315         movl 4(%esp), %eax
 316         cmpl $1, %eax
 317         jl LBB_foo_4    # return
 318 LBB_foo_1:      # no_exit.preheader
 319         xorl %ecx, %ecx
 320 LBB_foo_2:      # no_exit
 321         movl L_X$non_lazy_ptr, %edx
 322         movl %ecx, (%edx)
 323         incl %ecx
 324         cmpl %eax, %ecx
 325         jne LBB_foo_2   # no_exit
 326 LBB_foo_3:      # return.loopexit
 327 LBB_foo_4:      # return
 328         ret
 329
 330 We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
 331 remateralization is implemented. This can be accomplished with 1) a target
 332 dependent LICM pass or 2) makeing SelectDAG represent the whole function.
 333
 334 //===---------------------------------------------------------------------===//
 335
 336 The following tests perform worse with LSR:
 337
 338 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
 339
 340 //===---------------------------------------------------------------------===//
 341
 342 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 343 FR64 to VR128.
 344
 345 //===---------------------------------------------------------------------===//
 346
 347 mov $reg, 48(%esp)
 348 ...
 349 leal 48(%esp), %eax
 350 mov %eax, (%esp)
 351 call _foo
 352
 353 Obviously it would have been better for the first mov (or any op) to store
 354 directly %esp[0] if there are no other uses.
 355
 356 //===---------------------------------------------------------------------===//
 357
 358 Adding to the list of cmp / test poor codegen issues:
 359
 360 int test(__m128 *A, __m128 *B) {
 361   if (_mm_comige_ss(*A, *B))
 362     return 3;
 363   else
 364     return 4;
 365 }
 366
 367 _test:
 368         movl 8(%esp), %eax
 369         movaps (%eax), %xmm0
 370         movl 4(%esp), %eax
 371         movaps (%eax), %xmm1
 372         comiss %xmm0, %xmm1
 373         setae %al
 374         movzbl %al, %ecx
 375         movl $3, %eax
 376         movl $4, %edx
 377         cmpl $0, %ecx
 378         cmove %edx, %eax
 379         ret
 380
 381 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 382 are a number of issues. 1) We are introducing a setcc between the result of the
 383 intrisic call and select. 2) The intrinsic is expected to produce a i32 value
 384 so a any extend (which becomes a zero extend) is added.
 385
 386 We probably need some kind of target DAG combine hook to fix this.
 387
 388 //===---------------------------------------------------------------------===//
 389
 390 We generate significantly worse code for this than GCC:
 391 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 392 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 393
 394 There is also one case we do worse on PPC.
 395
 396 //===---------------------------------------------------------------------===//
 397
 398 If shorter, we should use things like:
 399 movzwl %ax, %eax
 400 instead of:
 401 andl $65535, %EAX
 402
 403 The former can also be used when the two-addressy nature of the 'and' would
 404 require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 405
 406 //===---------------------------------------------------------------------===//
 407
 408 Bad codegen:
 409
 410 char foo(int x) { return x; }
 411
 412 _foo:
 413         movl 4(%esp), %eax
 414         shll $24, %eax
 415         sarl $24, %eax
 416         ret
 417
 418 SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of
 419 sub-registers.
 420
 421 //===---------------------------------------------------------------------===//
 422
 423 Consider this:
 424
 425 typedef struct pair { float A, B; } pair;
 426 void pairtest(pair P, float *FP) {
 427         *FP = P.A+P.B;
 428 }
 429
 430 We currently generate this code with llvmgcc4:
 431
 432 _pairtest:
 433         movl 8(%esp), %eax
 434         movl 4(%esp), %ecx
 435         movd %eax, %xmm0
 436         movd %ecx, %xmm1
 437         addss %xmm0, %xmm1
 438         movl 12(%esp), %eax
 439         movss %xmm1, (%eax)
 440         ret
 441
 442 we should be able to generate:
 443 _pairtest:
 444         movss 4(%esp), %xmm0
 445         movl 12(%esp), %eax
 446         addss 8(%esp), %xmm0
 447         movss %xmm0, (%eax)
 448         ret
 449
 450 The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
 451 integer chunks.  It does this so that structs like {short,short} are passed in
 452 a single 32-bit integer stack slot.  We should handle the safe cases above much
 453 nicer, while still handling the hard cases.
 454
 455 While true in general, in this specific case we could do better by promoting
 456 load int + bitcast to float -> load fload.  This basically needs alignment info,
 457 the code is already implemented (but disabled) in dag combine).
 458
 459 //===---------------------------------------------------------------------===//
 460
 461 Another instruction selector deficiency:
 462
 463 void %bar() {
 464         %tmp = load int (int)** %foo
 465         %tmp = tail call int %tmp( int 3 )
 466         ret void
 467 }
 468
 469 _bar:
 470         subl $12, %esp
 471         movl L_foo$non_lazy_ptr, %eax
 472         movl (%eax), %eax
 473         call *%eax
 474         addl $12, %esp
 475         ret
 476
 477 The current isel scheme will not allow the load to be folded in the call since
 478 the load's chain result is read by the callseq_start.
 479
 480 //===---------------------------------------------------------------------===//
 481
 482 Don't forget to find a way to squash noop truncates in the JIT environment.
 483
 484 //===---------------------------------------------------------------------===//
 485
 486 Implement anyext in the same manner as truncate that would allow them to be
 487 eliminated.
 488
 489 //===---------------------------------------------------------------------===//
 490
 491 How about implementing truncate / anyext as a property of machine instruction
 492 operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register.
 493 Do this for the cases where a truncate / anyext is guaranteed to be eliminated.
 494 For IA32 that is truncate from 32 to 16 and anyext from 16 to 32.
 495
 496 //===---------------------------------------------------------------------===//
 497
 498 For this:
 499
 500 int test(int a)
 501 {
 502   return a * 3;
 503 }
 504
 505 We currently emits
 506         imull $3, 4(%esp), %eax
 507
 508 Perhaps this is what we really should generate is? Is imull three or four
 509 cycles? Note: ICC generates this:
 510         movl    4(%esp), %eax
 511         leal    (%eax,%eax,2), %eax
 512
 513 The current instruction priority is based on pattern complexity. The former is
 514 more "complex" because it folds a load so the latter will not be emitted.
 515
 516 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 517 should always try to match LEA first since the LEA matching code does some
 518 estimate to determine whether the match is profitable.
 519
 520 However, if we care more about code size, then imull is better. It's two bytes
 521 shorter than movl + leal.
 522
 523 //===---------------------------------------------------------------------===//
 524
 525 Implement CTTZ, CTLZ with bsf and bsr.
 526
 527 //===---------------------------------------------------------------------===//
 528
 529 It appears gcc place string data with linkonce linkage in
 530 .section __TEXT,__const_coal,coalesced instead of
 531 .section __DATA,__const_coal,coalesced.
 532 Take a look at darwin.h, there are other Darwin assembler directives that we
 533 do not make use of.
 534
 535 //===---------------------------------------------------------------------===//
 536
 537 int %foo(int* %a, int %t) {
 538 entry:
 539         br label %cond_true
 540
 541 cond_true:              ; preds = %cond_true, %entry
 542         %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
 543         %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
 544         %tmp2 = getelementptr int* %a, int %x.0.0
 545         %tmp3 = load int* %tmp2         ; <int> [#uses=1]
 546         %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
 547         %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
 548         %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
 549         %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
 550         br bool %tmp, label %bb12, label %cond_true
 551
 552 bb12:           ; preds = %cond_true
 553         ret int %tmp7
 554 }
 555
 556 is pessimized by -loop-reduce and -indvars
 557
 558 //===---------------------------------------------------------------------===//
 559
 560 u32 to float conversion improvement:
 561
 562 float uint32_2_float( unsigned u ) {
 563   float fl = (int) (u & 0xffff);
 564   float fh = (int) (u >> 16);
 565   fh *= 0x1.0p16f;
 566   return fh + fl;
 567 }
 568
 569 00000000        subl    $0x04,%esp
 570 00000003        movl    0x08(%esp,1),%eax
 571 00000007        movl    %eax,%ecx
 572 00000009        shrl    $0x10,%ecx
 573 0000000c        cvtsi2ss        %ecx,%xmm0
 574 00000010        andl    $0x0000ffff,%eax
 575 00000015        cvtsi2ss        %eax,%xmm1
 576 00000019        mulss   0x00000078,%xmm0
 577 00000021        addss   %xmm1,%xmm0
 578 00000025        movss   %xmm0,(%esp,1)
 579 0000002a        flds    (%esp,1)
 580 0000002d        addl    $0x04,%esp
 581 00000030        ret
 582
 583 //===---------------------------------------------------------------------===//
 584
 585 When using fastcc abi, align stack slot of argument of type double on 8 byte
 586 boundary to improve performance.
 587
 588 //===---------------------------------------------------------------------===//
 589
 590 Codegen:
 591
 592 int f(int a, int b) {
 593   if (a == 4 || a == 6)
 594     b++;
 595   return b;
 596 }
 597
 598
 599 as:
 600
 601 or eax, 2
 602 cmp eax, 6
 603 jz label
 604
 605 //===---------------------------------------------------------------------===//
 606
 607 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 608 simplifications for integer "x cmp y ? a : b".  For example, instead of:
 609
 610 int G;
 611 void f(int X, int Y) {
 612   G = X < 0 ? 14 : 13;
 613 }
 614
 615 compiling to:
 616
 617 _f:
 618         movl $14, %eax
 619         movl $13, %ecx
 620         movl 4(%esp), %edx
 621         testl %edx, %edx
 622         cmovl %eax, %ecx
 623         movl %ecx, _G
 624         ret
 625
 626 it could be:
 627 _f:
 628         movl    4(%esp), %eax
 629         sarl    $31, %eax
 630         notl    %eax
 631         addl    $14, %eax
 632         movl    %eax, _G
 633         ret
 634
 635 etc.
 636
 637 //===---------------------------------------------------------------------===//
 638
 639 Currently we don't have elimination of redundant stack manipulations. Consider
 640 the code:
 641
 642 int %main() {
 643 entry:
 644         call fastcc void %test1( )
 645         call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
 646         ret int 0
 647 }
 648
 649 declare fastcc void %test1()
 650
 651 declare fastcc void %test2(sbyte*)
 652
 653
 654 This currently compiles to:
 655
 656         subl $16, %esp
 657         call _test5
 658         addl $12, %esp
 659         subl $16, %esp
 660         movl $_test5, (%esp)
 661         call _test6
 662         addl $12, %esp
 663
 664 The add\sub pair is really unneeded here.
 665
 666 //===---------------------------------------------------------------------===//
 667
 668 We currently compile sign_extend_inreg into two shifts:
 669
 670 long foo(long X) {
 671   return (long)(signed char)X;
 672 }
 673
 674 becomes:
 675
 676 _foo:
 677         movl 4(%esp), %eax
 678         shll $24, %eax
 679         sarl $24, %eax
 680         ret
 681
 682 This could be:
 683
 684 _foo:
 685         movsbl  4(%esp),%eax
 686         ret
 687
 688 //===---------------------------------------------------------------------===//
 689
 690 Consider the expansion of:
 691
 692 uint %test3(uint %X) {
 693         %tmp1 = rem uint %X, 255
 694         ret uint %tmp1
 695 }
 696
 697 Currently it compiles to:
 698
 699 ...
 700         movl $2155905153, %ecx
 701         movl 8(%esp), %esi
 702         movl %esi, %eax
 703         mull %ecx
 704 ...
 705
 706 This could be "reassociated" into:
 707
 708         movl $2155905153, %eax
 709         movl 8(%esp), %ecx
 710         mull %ecx
 711
 712 to avoid the copy.  In fact, the existing two-address stuff would do this
 713 except that mul isn't a commutative 2-addr instruction.  I guess this has
 714 to be done at isel time based on the #uses to mul?
 715
 716 //===---------------------------------------------------------------------===//
 717
 718 Make sure the instruction which starts a loop does not cross a cacheline
 719 boundary. This requires knowning the exact length of each machine instruction.
 720 That is somewhat complicated, but doable. Example 256.bzip2:
 721
 722 In the new trace, the hot loop has an instruction which crosses a cacheline
 723 boundary.  In addition to potential cache misses, this can't help decoding as I
 724 imagine there has to be some kind of complicated decoder reset and realignment
 725 to grab the bytes from the next cacheline.
 726
 727 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 728 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 729 937  937 0x3d0a incl     %esi
 730 3    3   0x3d0b cmpb     %bl, %dl
 731 27   27  0x3d0d jnz      0x000062db <main+11707>
 732
 733 //===---------------------------------------------------------------------===//
 734
 735 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 736
 737 //===---------------------------------------------------------------------===//
 738
 739 This could be a single 16-bit load.
 740
 741 int f(char *p) {
 742     if ((p[0] == 1) & (p[1] == 2)) return 1;
 743     return 0;
 744 }
 745
 746 //===---------------------------------------------------------------------===//
 747
 748 We should inline lrintf and probably other libc functions.
 749
 750 //===---------------------------------------------------------------------===//
 751
 752 Start using the flags more.  For example, compile:
 753
 754 int add_zf(int *x, int y, int a, int b) {
 755      if ((*x += y) == 0)
 756           return a;
 757      else
 758           return b;
 759 }
 760
 761 to:
 762        addl    %esi, (%rdi)
 763        movl    %edx, %eax
 764        cmovne  %ecx, %eax
 765        ret
 766 instead of:
 767
 768 _add_zf:
 769         addl (%rdi), %esi
 770         movl %esi, (%rdi)
 771         testl %esi, %esi
 772         cmove %edx, %ecx
 773         movl %ecx, %eax
 774         ret
 775
 776 and:
 777
 778 int add_zf(int *x, int y, int a, int b) {
 779      if ((*x + y) < 0)
 780           return a;
 781      else
 782           return b;
 783 }
 784
 785 to:
 786
 787 add_zf:
 788         addl    (%rdi), %esi
 789         movl    %edx, %eax
 790         cmovns  %ecx, %eax
 791         ret
 792
 793 instead of:
 794
 795 _add_zf:
 796         addl (%rdi), %esi
 797         testl %esi, %esi
 798         cmovs %edx, %ecx
 799         movl %ecx, %eax
 800         ret
 801
 802 //===---------------------------------------------------------------------===//
 803
 804 This:
 805 #include <math.h>
 806 int foo(double X) { return isnan(X); }
 807
 808 compiles to (-m64):
 809
 810 _foo:
 811         pxor %xmm1, %xmm1
 812         ucomisd %xmm1, %xmm0
 813         setp %al
 814         movzbl %al, %eax
 815         ret
 816
 817 the pxor is not needed, we could compare the value against itself.
 818
 819 //===---------------------------------------------------------------------===//
 820
 821 These two functions have identical effects:
 822
 823 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 824 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 825
 826 We currently compile them to:
 827
 828 _f:
 829         movl 4(%esp), %eax
 830         movl %eax, %ecx
 831         incl %ecx
 832         movl 8(%esp), %edx
 833         cmpl %edx, %ecx
 834         jne LBB1_2      #UnifiedReturnBlock
 835 LBB1_1: #cond_true
 836         addl $2, %eax
 837         ret
 838 LBB1_2: #UnifiedReturnBlock
 839         movl %ecx, %eax
 840         ret
 841 _f2:
 842         movl 4(%esp), %eax
 843         movl %eax, %ecx
 844         incl %ecx
 845         cmpl 8(%esp), %ecx
 846         sete %cl
 847         movzbl %cl, %ecx
 848         leal 1(%ecx,%eax), %eax
 849         ret
 850
 851 both of which are inferior to GCC's:
 852
 853 _f:
 854         movl    4(%esp), %edx
 855         leal    1(%edx), %eax
 856         addl    $2, %edx
 857         cmpl    8(%esp), %eax
 858         cmove   %edx, %eax
 859         ret
 860 _f2:
 861         movl    4(%esp), %eax
 862         addl    $1, %eax
 863         xorl    %edx, %edx
 864         cmpl    8(%esp), %eax
 865         sete    %dl
 866         addl    %edx, %eax
 867         ret
 868
 869 //===---------------------------------------------------------------------===//
 870
 871 This code:
 872
 873 void test(int X) {
 874   if (X) abort();
 875 }
 876
 877 is currently compiled to:
 878
 879 _test:
 880         subl $12, %esp
 881         cmpl $0, 16(%esp)
 882         jne LBB1_1
 883         addl $12, %esp
 884         ret
 885 LBB1_1:
 886         call L_abort$stub
 887
 888 It would be better to produce:
 889
 890 _test:
 891         subl $12, %esp
 892         cmpl $0, 16(%esp)
 893         jne L_abort$stub
 894         addl $12, %esp
 895         ret
 896
 897 This can be applied to any no-return function call that takes no arguments etc.
 898 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 899 something like this:
 900
 901 _test:
 902         cmpl $0, 4(%esp)
 903         jne LBB1_1
 904         ret
 905 LBB1_1:
 906         subl $12, %esp
 907         call L_abort$stub
 908
 909 Both are useful in different situations.  Finally, it could be shrink-wrapped
 910 and tail called, like this:
 911
 912 _test:
 913         cmpl $0, 4(%esp)
 914         jne LBB1_1
 915         ret
 916 LBB1_1:
 917         pop %eax   # realign stack.
 918         call L_abort$stub
 919
 920 Though this probably isn't worth it.
 921
 922 //===---------------------------------------------------------------------===//