//===---------------------------------------------------------------------===//
-Legalize should lower ctlz like this:
- ctlz(x) = popcnt((x-1) & ~x)
-
-on targets that have popcnt but not ctlz. itanium, what else?
-
-//===---------------------------------------------------------------------===//
-
quantum_sigma_x in 462.libquantum contains the following loop:
for(i=0; i<reg->size; i++)
... which would only do one 32-bit XOR per loop iteration instead of two.
It would also be nice to recognize the reg->size doesn't alias reg->node[i], but
-alas...
+alas.
+
+//===---------------------------------------------------------------------===//
+
+This should be optimized to one 'and' and one 'or', from PR4216:
+
+define i32 @test_bitfield(i32 %bf.prev.low) nounwind ssp {
+entry:
+ %bf.prev.lo.cleared10 = or i32 %bf.prev.low, 32962 ; <i32> [#uses=1]
+ %0 = and i32 %bf.prev.low, -65536 ; <i32> [#uses=1]
+ %1 = and i32 %bf.prev.lo.cleared10, 40186 ; <i32> [#uses=1]
+ %2 = or i32 %1, %0 ; <i32> [#uses=1]
+ ret i32 %2
+}
//===---------------------------------------------------------------------===//
extra register for the function when effective_addr2 is declared as U64 than
when it is declared U32.
-//===---------------------------------------------------------------------===//
-
-Promote for i32 bswap can use i64 bswap + shr. Useful on targets with 64-bit
-regs and bswap, like itanium.
+PHI Slicing could be extended to do this.
//===---------------------------------------------------------------------===//
for (i = 0; i < N; i++) { X = i; Y = i*4; }
}
-produces two identical IV's (after promotion) on PPC/ARM:
+produces two near identical IV's (after promotion) on PPC/ARM:
-LBB1_1: @bb.preheader
- mov r3, #0
- mov r2, r3
- mov r1, r3
-LBB1_2: @bb
- ldr r12, LCPI1_0
- ldr r12, [r12]
- strh r2, [r12]
- ldr r12, LCPI1_1
- ldr r12, [r12]
- strh r3, [r12]
- add r1, r1, #1 <- [0,+,1]
- add r3, r3, #4
- add r2, r2, #1 <- [0,+,1]
- cmp r1, r0
- bne LBB1_2 @bb
+LBB1_2:
+ ldr r3, LCPI1_0
+ ldr r3, [r3]
+ strh r2, [r3]
+ ldr r3, LCPI1_1
+ ldr r3, [r3]
+ strh r1, [r3]
+ add r1, r1, #4
+ add r2, r2, #1 <- [0,+,1]
+ sub r0, r0, #1 <- [0,-,1]
+ cmp r0, #0
+ bne LBB1_2
+
+LSR should reuse the "+" IV for the exit test.
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-Tail recursion elimination is not transforming this function, because it is
-returning n, which fails the isDynamicConstant check in the accumulator
-recursion checks.
-
-long long fib(const long long n) {
- switch(n) {
- case 0:
- case 1:
- return n;
- default:
- return fib(n-1) + fib(n-2);
- }
-}
-
-//===---------------------------------------------------------------------===//
-
Tail recursion elimination should handle:
int pow2m1(int n) {
//===---------------------------------------------------------------------===//
-This GCC bug: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34043
-contains a testcase that compiles down to:
-
- %struct.XMM128 = type { <4 x float> }
-..
- %src = alloca %struct.XMM128
-..
- %tmp6263 = bitcast %struct.XMM128* %src to <2 x i64>*
- %tmp65 = getelementptr %struct.XMM128* %src, i32 0, i32 0
- store <2 x i64> %tmp5899, <2 x i64>* %tmp6263, align 16
- %tmp66 = load <4 x float>* %tmp65, align 16
- %tmp71 = add <4 x float> %tmp66, %tmp66
-
-If the mid-level optimizer turned the bitcast of pointer + store of tmp5899
-into a bitcast of the vector value and a store to the pointer, then the
-store->load could be easily removed.
-
-//===---------------------------------------------------------------------===//
-
Consider:
int test() {
//===---------------------------------------------------------------------===//
-We should be able to evaluate this loop:
-
-int test(int x_offs) {
- while (x_offs > 4)
- x_offs -= 4;
- return x_offs;
-}
-
-//===---------------------------------------------------------------------===//
-
Reassociate should turn things like:
int factorial(int X) {
//===---------------------------------------------------------------------===//
-We would like to do the following transform in the instcombiner:
-
- -X/C -> X/-C
-
-However, this isn't valid if (-X) overflows. We can implement this when we
-have the concept of a "C signed subtraction" operator that which is undefined
-on overflow.
-
-//===---------------------------------------------------------------------===//
-
This was noticed in the entryblock for grokdeclarator in 403.gcc:
%tmp = icmp eq i32 %decl_context, 4
//===---------------------------------------------------------------------===//
+[PHI TRANSLATE INDEXED GEPs] PR5313
+
+Load redundancy elimination for simple loop. This loop:
+
+void append_text(const char* text,unsigned char * const io) {
+ while(*text)
+ *io=*text++;
+}
+
+Compiles to have a fully redundant load in the loop (%2):
+
+define void @append_text(i8* nocapture %text, i8* nocapture %io) nounwind {
+entry:
+ %0 = load i8* %text, align 1 ; <i8> [#uses=1]
+ %1 = icmp eq i8 %0, 0 ; <i1> [#uses=1]
+ br i1 %1, label %return, label %bb
+
+bb: ; preds = %bb, %entry
+ %indvar = phi i32 [ 0, %entry ], [ %tmp, %bb ] ; <i32> [#uses=2]
+ %text_addr.04 = getelementptr i8* %text, i32 %indvar ; <i8*> [#uses=1]
+ %2 = load i8* %text_addr.04, align 1 ; <i8> [#uses=1]
+ store i8 %2, i8* %io, align 1
+ %tmp = add i32 %indvar, 1 ; <i32> [#uses=2]
+ %scevgep = getelementptr i8* %text, i32 %tmp ; <i8*> [#uses=1]
+ %3 = load i8* %scevgep, align 1 ; <i8> [#uses=1]
+ %4 = icmp eq i8 %3, 0 ; <i1> [#uses=1]
+ br i1 %4, label %return, label %bb
+
+return: ; preds = %bb, %entry
+ ret void
+}
+
+//===---------------------------------------------------------------------===//
+
There are many load PRE testcases in testsuite/gcc.dg/tree-ssa/loadpre* in the
GCC testsuite. There are many pre testcases as ssa-pre-*.c
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34677 (licm does this, LPRE crit edge)
llvm-gcc t2.c -S -o - -O0 -emit-llvm | llvm-as | opt -mem2reg -simplifycfg -gvn | llvm-dis
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16799 [BITCAST PHI TRANS]
+
//===---------------------------------------------------------------------===//
Type based alias analysis:
//===---------------------------------------------------------------------===//
-When GVN/PRE finds a store of float* to a must aliases pointer when expecting
-an int*, it should turn it into a bitcast. This is a nice generalization of
-the SROA hack that would apply to other cases, e.g.:
-
-int foo(int C, int *P, float X) {
- if (C) {
- bar();
- *P = 42;
- } else
- *(float*)P = X;
-
- return *P;
-}
-
-
-One example (that requires crazy phi translation) is:
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16799 [BITCAST PHI TRANS]
-
-//===---------------------------------------------------------------------===//
-
A/B get pinned to the stack because we turn an if/then into a select instead
of PRE'ing the load/store. This may be fixable in instcombine:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37892
+struct X { int i; };
+int foo (int x) {
+ struct X a;
+ struct X b;
+ struct X *p;
+ a.i = 1;
+ b.i = 2;
+ if (x)
+ p = &a;
+ else
+ p = &b;
+ return p->i;
+}
+//===---------------------------------------------------------------------===//
Interesting missed case because of control flow flattening (should be 2 loads):
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26629
//===---------------------------------------------------------------------===//
-Instcombine should replace the load with a constant in:
+int func(int a, int b) { if (a & 0x80) b |= 0x80; else b &= ~0x80; return b; }
- static const char x[4] = {'a', 'b', 'c', 'd'};
-
- unsigned int y(void) {
- return *(unsigned int *)x;
- }
+Generates this:
+
+define i32 @func(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+ %0 = and i32 %a, 128 ; <i32> [#uses=1]
+ %1 = icmp eq i32 %0, 0 ; <i1> [#uses=1]
+ %2 = or i32 %b, 128 ; <i32> [#uses=1]
+ %3 = and i32 %b, -129 ; <i32> [#uses=1]
+ %b_addr.0 = select i1 %1, i32 %3, i32 %2 ; <i32> [#uses=1]
+ ret i32 %b_addr.0
+}
+
+However, it's functionally equivalent to:
+
+ b = (b & ~0x80) | (a & 0x80);
+
+Which generates this:
+
+define i32 @func(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+ %0 = and i32 %b, -129 ; <i32> [#uses=1]
+ %1 = and i32 %a, 128 ; <i32> [#uses=1]
+ %2 = or i32 %0, %1 ; <i32> [#uses=1]
+ ret i32 %2
+}
+
+This can be generalized for other forms:
-It currently only does this transformation when the size of the constant
-is the same as the size of the integer (so, try x[5]) and the last byte
-is a null (making it a C string). There's no need for these restrictions.
+ b = (b & ~0x80) | (a & 0x40) << 1;
//===---------------------------------------------------------------------===//
-InstCombine's "turn load from constant into constant" optimization should be
-more aggressive in the presence of bitcasts. For example, because of unions,
-this code:
+These two functions produce different code. They shouldn't:
-union vec2d {
- double e[2];
- double v __attribute__((vector_size(16)));
-};
-typedef union vec2d vec2d;
+#include <stdint.h>
+
+uint8_t p1(uint8_t b, uint8_t a) {
+ b = (b & ~0xc0) | (a & 0xc0);
+ return (b);
+}
+
+uint8_t p2(uint8_t b, uint8_t a) {
+ b = (b & ~0x40) | (a & 0x40);
+ b = (b & ~0x80) | (a & 0x80);
+ return (b);
+}
+
+define zeroext i8 @p1(i8 zeroext %b, i8 zeroext %a) nounwind readnone ssp {
+entry:
+ %0 = and i8 %b, 63 ; <i8> [#uses=1]
+ %1 = and i8 %a, -64 ; <i8> [#uses=1]
+ %2 = or i8 %1, %0 ; <i8> [#uses=1]
+ ret i8 %2
+}
-static vec2d a={{1,2}}, b={{3,4}};
-
-vec2d foo () {
- return (vec2d){ .v = a.v + b.v * (vec2d){{5,5}}.v };
+define zeroext i8 @p2(i8 zeroext %b, i8 zeroext %a) nounwind readnone ssp {
+entry:
+ %0 = and i8 %b, 63 ; <i8> [#uses=1]
+ %.masked = and i8 %a, 64 ; <i8> [#uses=1]
+ %1 = and i8 %a, -128 ; <i8> [#uses=1]
+ %2 = or i8 %1, %0 ; <i8> [#uses=1]
+ %3 = or i8 %2, %.masked ; <i8> [#uses=1]
+ ret i8 %3
}
-Compiles into:
+//===---------------------------------------------------------------------===//
-@a = internal constant %0 { [2 x double]
- [double 1.000000e+00, double 2.000000e+00] }, align 16
-@b = internal constant %0 { [2 x double]
- [double 3.000000e+00, double 4.000000e+00] }, align 16
-...
-define void @foo(%struct.vec2d* noalias nocapture sret %agg.result) nounwind {
+IPSCCP does not currently propagate argument dependent constants through
+functions where it does not not all of the callers. This includes functions
+with normal external linkage as well as templates, C99 inline functions etc.
+Specifically, it does nothing to:
+
+define i32 @test(i32 %x, i32 %y, i32 %z) nounwind {
entry:
- %0 = load <2 x double>* getelementptr (%struct.vec2d*
- bitcast (%0* @a to %struct.vec2d*), i32 0, i32 0), align 16
- %1 = load <2 x double>* getelementptr (%struct.vec2d*
- bitcast (%0* @b to %struct.vec2d*), i32 0, i32 0), align 16
+ %0 = add nsw i32 %y, %z
+ %1 = mul i32 %0, %x
+ %2 = mul i32 %y, %z
+ %3 = add nsw i32 %1, %2
+ ret i32 %3
+}
+
+define i32 @test2() nounwind {
+entry:
+ %0 = call i32 @test(i32 1, i32 2, i32 4) nounwind
+ ret i32 %0
+}
+
+It would be interesting extend IPSCCP to be able to handle simple cases like
+this, where all of the arguments to a call are constant. Because IPSCCP runs
+before inlining, trivial templates and inline functions are not yet inlined.
+The results for a function + set of constant arguments should be memoized in a
+map.
+
+//===---------------------------------------------------------------------===//
+
+The libcall constant folding stuff should be moved out of SimplifyLibcalls into
+libanalysis' constantfolding logic. This would allow IPSCCP to be able to
+handle simple things like this:
+
+static int foo(const char *X) { return strlen(X); }
+int bar() { return foo("abcd"); }
+//===---------------------------------------------------------------------===//
+
+InstCombine should use SimplifyDemandedBits to remove the or instruction:
-Instcombine should be able to optimize away the loads (and thus the globals).
+define i1 @test(i8 %x, i8 %y) {
+ %A = or i8 %x, 1
+ %B = icmp ugt i8 %A, 3
+ ret i1 %B
+}
+Currently instcombine calls SimplifyDemandedBits with either all bits or just
+the sign bit, if the comparison is obviously a sign test. In this case, we only
+need all but the bottom two bits from %A, and if we gave that mask to SDB it
+would delete the or instruction for us.
//===---------------------------------------------------------------------===//