... which should only be one imul instruction.
+or:
+
+unsigned long long int t2(unsigned int a, unsigned int b) {
+ return (unsigned long long)a * b;
+}
+
+... which should be one mul instruction.
+
+
This can be done with a custom expander, but it would be nice to move this to
generic code.
//===---------------------------------------------------------------------===//
-Use push/pop instructions in prolog/epilog sequences instead of stores off
-ESP (certain code size win, perf win on some [which?] processors).
-Also, it appears icc use push for parameter passing. Need to investigate.
+It appears icc uses push for parameter passing. Need to investigate.
//===---------------------------------------------------------------------===//
There are 3 issues:
1. Lack of post regalloc LICM.
-2. Poor sub-regclass support. That leads to inability to promote the 16-bit
- arithmetic op to 32-bit and making use of leal.
-3. LSR unable to reused IV for a different type (i16 vs. i32) even though
+2. LSR is unable to reuse the IV for a different type (i16 vs. i32) even though
the cast would be free.
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-Bad codegen:
-
-char foo(int x) { return x; }
-
-_foo:
- movl 4(%esp), %eax
- shll $24, %eax
- sarl $24, %eax
- ret
-
-SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of
-sub-registers.
-
-//===---------------------------------------------------------------------===//
-
Consider this:
typedef struct pair { float A, B; } pair;
movl %edi, %eax
ret
-//===---------------------------------------------------------------------===//
+Another example is:
+
+;; X's live range extends beyond the shift, so the register allocator
+;; cannot coalesce it with Y. Because of this, a copy needs to be
+;; emitted before the shift to save the register value before it is
+;; clobbered. However, this copy is not needed if the register
+;; allocator turns the shift into an LEA. This also occurs for ADD.
+
+; Check that the shift gets turned into an LEA.
+; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
+; RUN: not grep {mov E.X, E.X}
-We use push/pop of stack space around calls in situations where we don't have to.
-Call to f below produces:
- subl $16, %esp <<<<<
- movl %eax, (%esp)
- call L_f$stub
- addl $16, %esp <<<<<
-The stack push/pop can be moved into the prolog/epilog. It does this because it's
-building the frame pointer, but this should not be sufficient, only the use of alloca
-should cause it to do this.
-(There are other issues shown by this code, but this is one.)
-
-typedef struct _range_t {
- float fbias;
- float fscale;
- int ibias;
- int iscale;
- int ishift;
- unsigned char lut[];
-} range_t;
-
-struct _decode_t {
- int type:4;
- int unit:4;
- int alpha:8;
- int N:8;
- int bpc:8;
- int bpp:16;
- int skip:8;
- int swap:8;
- const range_t*const*range;
-};
-
-typedef struct _decode_t decode_t;
-
-extern int f(const decode_t* decode);
-
-int decode_byte (const decode_t* decode) {
- if (decode->swap != 0)
- return f(decode);
- return 0;
+%G = external global int
+
+int %test1(int %X, int %Y) {
+ %Z = add int %X, %Y
+ volatile store int %Y, int* %G
+ volatile store int %Z, int* %G
+ ret int %X
}
+int %test2(int %X) {
+ %Z = add int %X, 1 ;; inc
+ volatile store int %Z, int* %G
+ ret int %X
+}
//===---------------------------------------------------------------------===//
has this xform, but it is currently disabled until the alignment fields of
the load/store nodes are trustworthy.
+//===---------------------------------------------------------------------===//
+
+Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
+a neg instead of a sub instruction. Consider:
+
+int test(char X) { return 7-X; }
+
+we currently produce:
+_test:
+ movl $7, %eax
+ movsbl 4(%esp), %ecx
+ subl %ecx, %eax
+ ret
+
+We would use one fewer register if codegen'd as:
+ movsbl 4(%esp), %eax
+ neg %eax
+ add $7, %eax
+ ret
+
+Note that this isn't beneficial if the load can be folded into the sub. In
+this case, we want a sub:
+
+int test(int X) { return 7-X; }
+_test:
+ movl $7, %eax
+ subl 4(%esp), %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+For code like:
+phi (undef, x)
+
+We get an implicit def on the undef side. If the phi is spilled, we then get:
+implicitdef xmm1
+store xmm1 -> stack
+
+It should be possible to teach the x86 backend to "fold" the store into the
+implicitdef, which just deletes the implicit def.
+
+These instructions should go away:
+#IMPLICIT_DEF %xmm1
+movaps %xmm1, 192(%esp)
+movaps %xmm1, 224(%esp)
+movaps %xmm1, 176(%esp)
+
+//===---------------------------------------------------------------------===//
+
+This is a "commutable two-address" register coalescing deficiency:
+
+define <4 x float> @test1(<4 x float> %V) {
+entry:
+ %tmp8 = shufflevector <4 x float> %V, <4 x float> undef, <4 x i32> < i32 3, i32 2, i32 1, i32 0 > ; <<4 x float>> [#uses=1]
+ %add = add <4 x float> %tmp8, %V ; <<4 x float>> [#uses=1]
+ ret <4 x float> %add
+}
+
+this codegens to:
+
+_test1:
+ pshufd $27, %xmm0, %xmm1
+ addps %xmm0, %xmm1
+ movaps %xmm1, %xmm0
+ ret
+
+instead of:
+
+_test1:
+ pshufd $27, %xmm0, %xmm1
+ addps %xmm1, %xmm0
+ ret