//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//
Add MUL2U and MUL2S nodes to represent a multiply that returns both the
Hi and Lo parts (a combination of MUL and MULH[SU] into one node). Add these to
X86 and make the dag combiner produce them when needed. This will eliminate one
imul from the code generated for:

long long test(long long X, long long Y) { return X*Y; }

by using the EAX result from the mul. We should add a similar node for
DIVREM.

Another case is:

long long test(int X, int Y) { return (long long)X*Y; }

... which should only be one imul instruction.
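
For reference, a plausible single-imul sequence for the second example
(assuming the usual i386 cdecl convention, with the 64-bit result returned in
EDX:EAX) is:

        movl 4(%esp), %eax
        imull 8(%esp)           # EDX:EAX = EAX * 8(%esp), signed widening
        ret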
//===---------------------------------------------------------------------===//

This should be one DIV/IDIV instruction, not a libcall:

unsigned test(unsigned long long X, unsigned Y) {
        return X/Y;
}

This can be done trivially with a custom legalizer. What about overflow
though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
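
A sketch of the desired codegen (i386 cdecl; note that divl faults with #DE
when the quotient does not fit in 32 bits, which is exactly the overflow
question above):

        movl 4(%esp), %eax      # low half of X
        movl 8(%esp), %edx      # high half of X
        divl 12(%esp)           # EAX = EDX:EAX / Y, EDX = remainder
        ret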
//===---------------------------------------------------------------------===//

Improvements to the multiply -> shift/add algorithm:
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
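
For illustration, the kind of decompositions such an algorithm should find
(assuming the operand is already in %eax):

int mul9(int x)  { return x * 9;  }   /* leal (%eax,%eax,8), %eax */
int mul10(int x) { return x * 10; }   /* leal (%eax,%eax,4), %eax
                                         addl %eax, %eax          */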
//===---------------------------------------------------------------------===//

Improve code like this (occurs fairly frequently, e.g. in LLVM):

long long foo(int x) { return 1LL << x; }

http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html

Another useful one would be ~0ULL >> X and ~0ULL << X.
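
For reference, a sketch of the generic i386 64-bit shift-left sequence
specialized to a constant 1 (the hardware masks the shift count to 5 bits,
hence the explicit fixup for counts >= 32):

        movb 4(%esp), %cl
        movl $1, %eax
        xorl %edx, %edx
        shldl %cl, %eax, %edx   # edx:eax <<= (cl mod 32)
        sall %cl, %eax
        testb $32, %cl
        je 1f
        movl %eax, %edx         # count >= 32: move low half up, zero low
        xorl %eax, %eax
1:      ret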
//===---------------------------------------------------------------------===//

We generate poor code for:

_Bool f(_Bool a) { return a!=1; }

Since a _Bool is always 0 or 1, a!=1 is just a^1; this should be a single xor.

//===---------------------------------------------------------------------===//
Some instruction selection ideas:

1. Dynamic programming based approach when compile time is not an
   issue.
2. Code duplication (addressing mode) during isel.
3. Other ideas from "Register-Sensitive Selection, Duplication, and
   Sequencing of Instructions".
4. Scheduling for reduced register pressure. E.g. "Minimum Register
   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
   and other related papers.
   http://citeseer.ist.psu.edu/govindarajan01minimum.html
//===---------------------------------------------------------------------===//

Should we promote i16 to i32 to avoid partial register update stalls?
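
For example (my reading of the stall, on P6-family processors):

/* Compiled with 16-bit ops, "addw %cx, %ax" writes only the low half of
   %eax; a later read of the full %eax must wait for a partial-register
   merge.  Doing the arithmetic with 32-bit addl and truncating only at
   the final use avoids the stall. */
short add16(short a, short b) { return a + b; }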
//===---------------------------------------------------------------------===//

Leave any_extend as a pseudo instruction and hint to the register
allocator. Delay codegen until post register allocation.
//===---------------------------------------------------------------------===//

Count leading zeros and count trailing zeros:

int clz(int X) { return __builtin_clz(X); }
int ctz(int X) { return __builtin_ctz(X); }

$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel

        bsr %eax, DWORD PTR [%esp+4]
        bsf %eax, DWORD PTR [%esp+4]

However, check that these are defined for 0 and 32. Our intrinsics are, GCC's
aren't.
//===---------------------------------------------------------------------===//

Use push/pop instructions in prolog/epilog sequences instead of stores off
ESP (certain code size win, perf win on some [which?] processors).
Also, it appears icc uses push for parameter passing. Need to investigate.
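
For example, instead of:

        subl $8, %esp
        movl %esi, 4(%esp)
        movl %ebx, (%esp)

emit (assuming no other frame allocation is needed):

        pushl %esi
        pushl %ebx

Each push is a single byte and subsumes the stack adjustment.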
//===---------------------------------------------------------------------===//

Only use inc/neg/not instructions on processors where they are faster than
add/sub/xor. They are slower on the P4 due to only updating some of the
processor flags.
//===---------------------------------------------------------------------===//

The instruction selector sometimes misses folding a load into a compare. The
pattern is written as (cmp reg, (load p)). Because the compare isn't
commutative, it is not matched with the load on both sides. The dag combiner
should be made smart enough to canonicalize the load into the RHS of a compare
when it can invert the result of the compare for free.
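
For example (hypothetical function to show the pattern):

/* The IR here is (setgt (load p), x), which does not match the
   (cmp reg, (load p)) pattern.  Swapping the operands and inverting the
   condition gives (setlt x, (load p)) -- the same test, but now the load
   folds into the cmp. */
int f(int *p, int x) { return *p > x; }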
//===---------------------------------------------------------------------===//

How about intrinsics? An example is:
  *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));

The generated code currently contains:

        pmuludq (%eax), %xmm0

The transformation probably requires an X86-specific pass or a DAG combiner
target-specific hook.
//===---------------------------------------------------------------------===//

In many cases, LLVM generates code for a setcc result as a setcc followed by a
movzbl to zero-extend it. On some processors (which ones?), it is more
efficient to zero the full register with an xor before the compare and then
setcc into the low byte. Doing this correctly is tricky though, as the xor
clobbers the flags and must therefore be scheduled before the compare.
//===---------------------------------------------------------------------===//

We should generate bts/btr/etc instructions on targets where they are cheap or
when codesize is important. e.g., for:

void setbit(int *target, int bit) {
  *target |= (1 << bit);
}
void clearbit(int *target, int bit) {
  *target &= ~(1 << bit);
}
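
A sketch of the desired output for setbit (btrl instead of btsl gives
clearbit); bit offsets 0..31 match the C code, whose shift is only defined in
that range anyway:

        movl 4(%esp), %eax      # target
        movl 8(%esp), %ecx      # bit
        btsl %ecx, (%eax)
        ret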
//===---------------------------------------------------------------------===//

Instead of the following for memset char*, 1, 10:

        movl $16843009, 4(%edx)
        movl $16843009, (%edx)
        movw $257, 8(%edx)

it might be better to generate

        movl $16843009, %eax
        movl %eax, 4(%edx)
        movl %eax, (%edx)
        movw %ax, 8(%edx)

when we can spare a register. It reduces code size.
//===---------------------------------------------------------------------===//

Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
generate the standard sign-fixup sequence: compute (X >> 31) >> 29 (7 for
negative X, 0 otherwise), add it to X, then arithmetic-shift right by 3.
GCC knows several different ways to codegen it, one of which selects X+7 with
a cmov for negative inputs, which is probably slower, but it's interesting at
least :)
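
The underlying identity, in C (assuming >> on a negative int is an arithmetic
shift, as on x86):

int div8(int X) {
  /* Round-toward-zero divide by 8: add 7 to negative inputs before the
     arithmetic shift.  (X >> 31) is -1 for negative X, so the mask yields
     7 exactly when the fixup is needed. */
  return (X + ((X >> 31) & 7)) >> 3;
}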
//===---------------------------------------------------------------------===//

Should generate min/max for stuff like:

void minf(float a, float b, float *X) {
  *X = a <= b ? a : b;
}

Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN
and ISD::FMAX node types?
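
A plausible SSE sequence for minf (on ties and NaNs minss returns its source
operand, so this matches the C only modulo NaN and -0.0 semantics):

        movss 4(%esp), %xmm0    # a
        minss 8(%esp), %xmm0    # xmm0 = a < b ? a : b
        movl 12(%esp), %eax
        movss %xmm0, (%eax)
        ret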
//===---------------------------------------------------------------------===//

The first BB of this code:

        %V = call bool %foo()
        br bool %V, label %T, label %F

currently tests the call result with an xor and a test. It would be better to
emit "cmp %al, 1" than a xor and test.
//===---------------------------------------------------------------------===//

Enable X86InstrInfo::convertToThreeAddress().
//===---------------------------------------------------------------------===//

We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and
rep/movsl. We should leave these as libcalls for everything over a much lower
threshold, since libc is hand tuned for medium and large mem ops (avoiding RFO
for large stores, TLB preheating, etc.).
//===---------------------------------------------------------------------===//

Optimize this into something reasonable:
 x * copysign(1.0, y) * copysign(1.0, z)
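
A sketch of the strength reduction this allows: the two copysign factors are
exactly +/-1.0, so the product only flips the sign of x by sign(y) ^ sign(z),
which is pure integer bit math (opt is a hypothetical name):

#include <stdint.h>
#include <string.h>

double opt(double x, double y, double z) {
  uint64_t xi, yi, zi;
  memcpy(&xi, &x, sizeof xi);
  memcpy(&yi, &y, sizeof yi);
  memcpy(&zi, &z, sizeof zi);
  xi ^= (yi ^ zi) & 0x8000000000000000ULL; /* flip sign(x) by sign(y)^sign(z) */
  memcpy(&x, &xi, sizeof x);
  return x;
}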
//===---------------------------------------------------------------------===//

Optimize copysign(x, *y) to use an integer load from y.
//===---------------------------------------------------------------------===//

%X = weak global int 0

        %N = cast int %N to uint
        %tmp.24 = setgt int %N, 0
        br bool %tmp.24, label %no_exit, label %return

no_exit:
        %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
        %i.0.0 = cast uint %indvar to int
        volatile store int %i.0.0, int* %X
        %indvar.next = add uint %indvar, 1
        %exitcond = seteq uint %indvar.next, %N
        br bool %exitcond, label %return, label %no_exit

compiles (in part) to:

        jl LBB_foo_4    # return
LBB_foo_1:      # no_exit.preheader
        movl L_X$non_lazy_ptr, %edx
        jne LBB_foo_2   # no_exit
LBB_foo_3:      # return.loopexit

We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
rematerialization is implemented. This can be accomplished with 1) a target
dependent LICM pass or 2) making SelectionDAG represent the whole function.
//===---------------------------------------------------------------------===//

The following tests perform worse with LSR:

lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and
Treesort.
//===---------------------------------------------------------------------===//

Teach the coalescer to coalesce vregs of different register classes,
e.g. FR32 / FR64 to VR128.
//===---------------------------------------------------------------------===//

Obviously it would have been better for the first mov (or any op) to store
directly to (%esp) if there are no other uses.
//===---------------------------------------------------------------------===//

Adding to the list of cmp / test poor codegen issues:

int test(__m128 *A, __m128 *B) {
  if (_mm_comige_ss(*A, *B))
    return 3;
  else
    return 4;
}

Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
are a number of issues. 1) We are introducing a setcc between the result of the
intrinsic call and select. 2) The intrinsic is expected to produce an i32 value
so an any_extend (which becomes a zero_extend) is added.

We probably need some kind of target DAG combine hook to fix this.
//===---------------------------------------------------------------------===//

We generate significantly worse code for this than GCC:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701

There is also one case we do worse on PPC.
//===---------------------------------------------------------------------===//

If shorter, we should use things like:

        movzwl %ax, %eax

instead of:

        andl $65535, %eax

The former can also be used when the two-addressy nature of the 'and' would
require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
//===---------------------------------------------------------------------===//

We generate poor code for:

char foo(int x) { return x; }

SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of
the sign-extending move instructions (movsbl).
//===---------------------------------------------------------------------===//

typedef struct pair { float A, B; } pair;
void pairtest(pair P, float *FP) {
        *FP = P.A+P.B;
}

We currently generate poor code for this with llvmgcc4 (the struct is
round-tripped through the stack as integers); we should be able to generate
code that simply adds the two floats.

The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
integer chunks. It does this so that structs like {short,short} are passed in
a single 32-bit integer stack slot. We should handle the safe cases above much
better, while still handling the hard cases.
//===---------------------------------------------------------------------===//

Another instruction selector deficiency:

        %tmp = load int (int)** %foo
        %tmp = tail call int %tmp( int 3 )

This generates:

        movl L_foo$non_lazy_ptr, %eax

followed by a separate load of the function pointer and an indirect call; the
load should be folded into the call. The current isel scheme will not allow
the load to be folded in the call since the load's chain result is read by
the callseq_start.
//===---------------------------------------------------------------------===//

Don't forget to find a way to squash noop truncates in the JIT environment.
//===---------------------------------------------------------------------===//

Implement anyext in the same manner as truncate, which would allow them to be
eliminated.
//===---------------------------------------------------------------------===//

How about implementing truncate / anyext as a property of a machine instruction
operand? i.e. print as a 32-bit super-class register / 16-bit sub-class
register. Do this for the cases where a truncate / anyext is guaranteed to be
eliminated. For IA32 that is truncate from 32 to 16 and anyext from 16 to 32.
//===---------------------------------------------------------------------===//

For a multiply by 3, we currently generate:

        imull $3, 4(%esp), %eax

Perhaps this is what we really should generate? Is imull three or four
cycles? Note: ICC generates this:

        movl 4(%esp), %eax
        leal (%eax,%eax,2), %eax

The current instruction priority is based on pattern complexity. The former is
more "complex" because it folds a load so the latter will not be emitted.

Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
should always try to match LEA first since the LEA matching code does some
estimate to determine whether the match is profitable.

However, if we care more about code size, then imull is better. It's two bytes
shorter than movl + leal.
//===---------------------------------------------------------------------===//

Implement CTTZ, CTLZ with bsf and bsr.
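
A sketch (bsf/bsr leave the destination undefined when the input is zero, so
zero inputs need separate handling):

ctz:    bsfl 4(%esp), %eax      # index of lowest set bit == cttz
        ret
clz:    bsrl 4(%esp), %eax      # index of highest set bit
        xorl $31, %eax          # ctlz = 31 - bsr for nonzero input
        ret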
//===---------------------------------------------------------------------===//

It appears gcc places string data with linkonce linkage in
.section __TEXT,__const_coal,coalesced instead of
.section __DATA,__const_coal,coalesced.
Take a look at darwin.h; there are other Darwin assembler directives that we
do not make use of.

//===---------------------------------------------------------------------===//
We should handle __attribute__ ((__visibility__ ("hidden"))).

//===---------------------------------------------------------------------===//
int %foo(int* %a, int %t) {
entry:
        br label %cond_true

cond_true:              ; preds = %cond_true, %entry
        %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]          ; <int> [#uses=3]
        %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]    ; <int> [#uses=1]
        %tmp2 = getelementptr int* %a, int %x.0.0       ; <int*> [#uses=1]
        %tmp3 = load int* %tmp2         ; <int> [#uses=1]
        %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
        %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
        %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
        %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
        br bool %tmp, label %bb12, label %cond_true

bb12:           ; preds = %cond_true
        ret int %tmp7
}

is pessimized by -loop-reduce and -indvars.

//===---------------------------------------------------------------------===//
Use cpuid to auto-detect CPU features such as SSE, SSE2, and SSE3.
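
A minimal user-level sketch of the detection (GCC-style inline asm; leaf 1
reports SSE in EDX bit 25, SSE2 in EDX bit 26, SSE3 in ECX bit 0; 32-bit PIC
code reserves EBX, which would need to be saved around the cpuid):

static void cpuid(unsigned op, unsigned *a, unsigned *b,
                  unsigned *c, unsigned *d) {
  __asm__("cpuid" : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d) : "a"(op));
}

int has_sse3(void) {
  unsigned a, b, c, d;
  cpuid(1, &a, &b, &c, &d);
  return c & 1;                 /* SSE: (d>>25)&1, SSE2: (d>>26)&1 */
}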
//===---------------------------------------------------------------------===//
u32 to float conversion improvement:

float uint32_2_float( unsigned u ) {
  float fl = (int) (u & 0xffff);
  float fh = (int) (u >> 16);
  return fh * 65536.0f + fl;
}

00000000        subl    $0x04,%esp
00000003        movl    0x08(%esp,1),%eax
00000007        movl    %eax,%ecx
00000009        shrl    $0x10,%ecx
0000000c        cvtsi2ss        %ecx,%xmm0
00000010        andl    $0x0000ffff,%eax
00000015        cvtsi2ss        %eax,%xmm1
00000019        mulss   0x00000078,%xmm0
00000021        addss   %xmm1,%xmm0
00000025        movss   %xmm0,(%esp,1)
0000002a        flds    (%esp,1)
0000002d        addl    $0x04,%esp
//===---------------------------------------------------------------------===//

When using the fastcc ABI, align stack slots of arguments of type double on an
8-byte boundary to improve performance.
//===---------------------------------------------------------------------===//

Improve the code generated for tests like:

int f(int a, int b) {
  if (a == 4 || a == 6)
    b++;
  return b;
}

//===---------------------------------------------------------------------===//
This code:

int %test(ulong *%tmp) {
        %tmp = load ulong* %tmp         ; <ulong> [#uses=1]
        %tmp.mask = shr ulong %tmp, ubyte 50            ; <ulong> [#uses=1]
        %tmp.mask = cast ulong %tmp.mask to ubyte       ; <ubyte> [#uses=1]
        %tmp2 = and ubyte %tmp.mask, 3          ; <ubyte> [#uses=1]
        %tmp2 = cast ubyte %tmp2 to int         ; <int> [#uses=1]
        ret int %tmp2
}

currently compiles to code containing:

        # TRUNCATE movb %al, %al

This saves a movzbl, and saves a truncate if it doesn't get coalesced right.
This is a simple DAGCombine to propagate the zext through the and.