lib/Target/X86/README-X86-64.txt

   1 //===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===//
   2
   3 AMD64 Optimization Manual 8.2 has some nice information about optimizing integer
   4 multiplication by a constant. How much of it applies to Intel's X86-64
   5 implementation? There are definite trade-offs to consider: latency vs. register
   6 pressure vs. code size.
   7
   8 //===---------------------------------------------------------------------===//
   9
  10 Are we better off using branches instead of cmove to implement FP to
  11 unsigned i64?
  12
  13 _conv:
  14         ucomiss LC0(%rip), %xmm0
  15         cvttss2siq      %xmm0, %rdx
  16         jb      L3
  17         subss   LC0(%rip), %xmm0
  18         movabsq $-9223372036854775808, %rax
  19         cvttss2siq      %xmm0, %rdx
  20         xorq    %rax, %rdx
  21 L3:
  22         movq    %rdx, %rax
  23         ret
  24
  25 instead of
  26
  27 _conv:
  28         movss LCPI1_0(%rip), %xmm1
  29         cvttss2siq %xmm0, %rcx
  30         movaps %xmm0, %xmm2
  31         subss %xmm1, %xmm2
  32         cvttss2siq %xmm2, %rax
  33         movabsq $-9223372036854775808, %rdx
  34         xorq %rdx, %rax
  35         ucomiss %xmm1, %xmm0
  36         cmovb %rcx, %rax
  37         ret
  38
  39 Seems like the jb branch has a high likelihood of being taken. It would have
  40 saved a few instructions.
  41
  42 //===---------------------------------------------------------------------===//
  43
  44 Poor codegen:
  45
  46 int X[2];
  47 int b;
  48 void test(void) {
  49   memset(X, b, 2*sizeof(X[0]));
  50 }
  51
  52 llc:
  53         movq _b@GOTPCREL(%rip), %rax
  54         movzbq (%rax), %rax
  55         movq %rax, %rcx
  56         shlq $8, %rcx
  57         orq %rax, %rcx
  58         movq %rcx, %rax
  59         shlq $16, %rax
  60         orq %rcx, %rax
  61         movq %rax, %rcx
  62         shlq $32, %rcx
  63         movq _X@GOTPCREL(%rip), %rdx
  64         orq %rax, %rcx
  65         movq %rcx, (%rdx)
  66         ret
  67
  68 gcc:
  69         movq    _b@GOTPCREL(%rip), %rax
  70         movabsq $72340172838076673, %rdx
  71         movzbq  (%rax), %rax
  72         imulq   %rdx, %rax
  73         movq    _X@GOTPCREL(%rip), %rdx
  74         movq    %rax, (%rdx)
  75         ret
  76
  77 //===---------------------------------------------------------------------===//
  78
  79 It's not possible to reference the AH, BH, CH, and DH registers in an instruction
  80 requiring a REX prefix. However, divb and mulb both produce results in AH. If isel
  81 emits a CopyFromReg from AH, it gets turned into a movb whose destination may be
  82 allocated to r8b - r15b, which requires a REX prefix and so cannot be encoded.
  83
  84 To get around this, isel emits a CopyFromReg from AX and then right shifts it
  85 down by 8 and truncates it. It's not pretty but it works. We need some register
  86 allocation magic to make the hack go away (e.g. putting additional constraints
  87 on the result of the movb).
  88
  89 //===---------------------------------------------------------------------===//
  90
  91 The x86-64 ABI for hidden-argument struct returns requires that the
  92 incoming value of %rdi be copied into %rax by the callee upon return.
  93
  94 The idea is that it saves callers from having to remember this value,
  95 which would often require a callee-saved register. Callees usually
  96 need to keep this value live for most of their body anyway, so it
  97 doesn't add a significant burden on them.
  98
  99 We currently implement this in codegen; however, this is suboptimal
 100 because it means that it would be quite awkward to implement the
 101 optimization for callers (reusing the returned %rax instead of saving the pointer).
 102
 103 A better implementation would be to relax the LLVM IR rules for sret
 104 arguments to allow a function with an sret argument to have a non-void
 105 return type, and to have the front-end set up the sret argument value
 106 as the return value of the function. The front-end could more easily
 107 emit uses of the returned struct value to be in terms of the function's
 108 lowered return value, and it would free non-C frontends from a
 109 complication only required by a C-based ABI.
 110
 111 //===---------------------------------------------------------------------===//
 112
 113 We get a redundant zero extension for code like this:
 114
 115 int mask[1000];
 116 int foo(unsigned x) {
 117  if (x < 10)
 118    x = x * 45;
 119  else
 120    x = x * 78;
 121  return mask[x];
 122 }
 123
 124 _foo:
 125 LBB1_0: ## entry
 126         cmpl    $9, %edi
 127         jbe     LBB1_3  ## bb
 128 LBB1_1: ## bb1
 129         imull   $78, %edi, %eax
 130 LBB1_2: ## bb2
 131         movl    %eax, %eax                    <----
 132         movq    _mask@GOTPCREL(%rip), %rcx
 133         movl    (%rcx,%rax,4), %eax
 134         ret
 135 LBB1_3: ## bb
 136         imull   $45, %edi, %eax
 137         jmp     LBB1_2  ## bb2
 138
 139 Before regalloc, we have:
 140
 141         %reg1025<def> = IMUL32rri8 %reg1024, 45, %EFLAGS<imp-def>
 142         JMP mbb<bb2,0x203afb0>
 143     Successors according to CFG: 0x203afb0 (#3)
 144
 145 bb1: 0x203af60, LLVM BB @0x1e02310, ID#2:
 146     Predecessors according to CFG: 0x203aec0 (#0)
 147         %reg1026<def> = IMUL32rri8 %reg1024, 78, %EFLAGS<imp-def>
 148     Successors according to CFG: 0x203afb0 (#3)
 149
 150 bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3:
 151     Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2)
 152         %reg1027<def> = PHI %reg1025, mbb<bb,0x203af10>,
 153                             %reg1026, mbb<bb1,0x203af60>
 154         %reg1029<def> = MOVZX64rr32 %reg1027
 155
 156 so we'd have to know that IMUL32rri8 leaves the upper 32 bits zeroed, and be
 157 able to recognize the zero extend as redundant.  This could also presumably be
 158 implemented if we have whole-function selectiondags.
 159
 160 //===---------------------------------------------------------------------===//