//===---------------------------------------------------------------------===//
-Reassociate should turn: X*X*X*X -> t=(X*X) (t*t) to eliminate a multiply.
+Reassociate should turn things like:
+
+int factorial(int X) {
+ return X*X*X*X*X*X*X*X;
+}
+
+into llvm.powi calls, allowing the code generator to produce balanced
+multiplication trees.
+
+First, the intrinsic needs to be extended to support integers, and second the
+code generator needs to be enhanced to lower these to multiplication trees.
//===---------------------------------------------------------------------===//
return bar(z, n) + bar(2*z, 2*n);
}
-Reassociate should handle the example in GCC PR16157.
+This is blocked on not handling X*X*X -> powi(X, 3) (see note above). The issue
+is that we end up getting t = 2*X and s = t*t, and don't turn this into 4*X*X,
+which is the same number of multiplies and is canonical, because the 2*X has
+multiple uses. Here's a simple example:
+
+define i32 @test15(i32 %X1) {
+ %B = mul i32 %X1, 47 ; X1*47
+ %C = mul i32 %B, %B
+ ret i32 %C
+}
+
+
+//===---------------------------------------------------------------------===//
+
+Reassociate should handle the example in GCC PR16157:
+
+extern int a0, a1, a2, a3, a4; extern int b0, b1, b2, b3, b4;
+void f () { /* this can be optimized to four additions... */
+ b4 = a4 + a3 + a2 + a1 + a0;
+ b3 = a3 + a2 + a1 + a0;
+ b2 = a2 + a1 + a0;
+ b1 = a1 + a0;
+}
+
+This requires reassociating to forms of expressions that are already available,
+something that reassoc doesn't think about yet.
+
+
+//===---------------------------------------------------------------------===//
+
+This function: (derived from GCC PR19988)
+double foo(double x, double y) {
+ return ((x + 0.1234 * y) * (x + -0.1234 * y));
+}
+
+compiles to:
+_foo:
+ movapd %xmm1, %xmm2
+ mulsd LCPI1_1(%rip), %xmm1
+ mulsd LCPI1_0(%rip), %xmm2
+ addsd %xmm0, %xmm1
+ addsd %xmm0, %xmm2
+ movapd %xmm1, %xmm0
+ mulsd %xmm2, %xmm0
+ ret
+
+Reassociate should be able to turn it into:
+
+double foo(double x, double y) {
+ return ((x + 0.1234 * y) * (x - 0.1234 * y));
+}
+
+Which allows the multiply by constant to be CSE'd, producing:
+
+_foo:
+ mulsd LCPI1_0(%rip), %xmm1
+ movapd %xmm1, %xmm2
+ addsd %xmm0, %xmm2
+ subsd %xmm1, %xmm0
+ mulsd %xmm2, %xmm0
+ ret
+
+This doesn't need -ffast-math support at all. This is particularly bad because
+the llvm-gcc frontend is canonicalizing the latter into the former, but clang
+doesn't have this problem.
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-dag/inst combine "clz(x)>>5 -> x==0" for 32-bit x.
-
-Compile:
-
-int bar(int x)
-{
- int t = __builtin_clz(x);
- return -(t>>5);
-}
-
-to:
-
-_bar: addic r3,r3,-1
- subfe r3,r3,r3
- blr
-
-//===---------------------------------------------------------------------===//
-
quantum_sigma_x in 462.libquantum contains the following loop:
for(i=0; i<reg->size; i++)
//===---------------------------------------------------------------------===//
-This should be optimized to one 'and' and one 'or', from PR4216:
-
-define i32 @test_bitfield(i32 %bf.prev.low) nounwind ssp {
-entry:
- %bf.prev.lo.cleared10 = or i32 %bf.prev.low, 32962 ; <i32> [#uses=1]
- %0 = and i32 %bf.prev.low, -65536 ; <i32> [#uses=1]
- %1 = and i32 %bf.prev.lo.cleared10, 40186 ; <i32> [#uses=1]
- %2 = or i32 %1, %0 ; <i32> [#uses=1]
- ret i32 %2
-}
-
-//===---------------------------------------------------------------------===//
-
This isn't recognized as bswap by instcombine (yes, it really is bswap):
unsigned long reverse(unsigned v) {
//===---------------------------------------------------------------------===//
+[LOOP RECOGNITION]
+
These idioms should be recognized as popcount (see PR1488):
unsigned countbits_slow(unsigned v) {
//===---------------------------------------------------------------------===//
+[LOOP RECOGNITION]
+
viterbi speeds up *significantly* if the various "history" related copy loops
are turned into memcpy calls at the source level. We need a "loops to memcpy"
pass.
//===---------------------------------------------------------------------===//
+[LOOP OPTIMIZATION]
+
+SingleSource/Benchmarks/Misc/dt.c shows several interesting optimization
+opportunities in its double_array_divs_variable function: it needs loop
+interchange, memory promotion (which LICM already does), vectorization and
+variable trip count loop unrolling (since it has a constant trip count). ICC
+apparently produces this very nice code with -ffast-math:
+
+..B1.70: # Preds ..B1.70 ..B1.69
+ mulpd %xmm0, %xmm1 #108.2
+ mulpd %xmm0, %xmm1 #108.2
+ mulpd %xmm0, %xmm1 #108.2
+ mulpd %xmm0, %xmm1 #108.2
+ addl $8, %edx #
+ cmpl $131072, %edx #108.2
+ jb ..B1.70 # Prob 99% #108.2
+
+It would be better to count down to zero, but this is a lot better than what we
+do.
+
+//===---------------------------------------------------------------------===//
+
Consider:
typedef unsigned U32;
//===---------------------------------------------------------------------===//
-Reassociate should turn things like:
-
-int factorial(int X) {
- return X*X*X*X*X*X*X*X;
-}
-
-into llvm.powi calls, allowing the code generator to produce balanced
-multiplication trees.
-
-//===---------------------------------------------------------------------===//
-
-We generate a horrible libcall for llvm.powi. For example, we compile:
-
-#include <cmath>
-double f(double a) { return std::pow(a, 4); }
-
-into:
-
-__Z1fd:
- subl $12, %esp
- movsd 16(%esp), %xmm0
- movsd %xmm0, (%esp)
- movl $4, 8(%esp)
- call L___powidf2$stub
- addl $12, %esp
- ret
-
-GCC produces:
-
-__Z1fd:
- subl $12, %esp
- movsd 16(%esp), %xmm0
- mulsd %xmm0, %xmm0
- mulsd %xmm0, %xmm0
- movsd %xmm0, (%esp)
- fldl (%esp)
- addl $12, %esp
- ret
-
-//===---------------------------------------------------------------------===//
-
We compile this program: (from GCC PR11680)
http://gcc.gnu.org/bugzilla/attachment.cgi?id=4487
true();
}
-I think this basically amounts to a dag combine to simplify comparisons against
-multiply hi's into a comparison against the mullo.
+This is equivalent to the following, where 2863311531 is the multiplicative
+inverse of 3, and 1431655766 is ((2^32)-1)/3+1:
+void bar(unsigned n) {
+ if (n * 2863311531U < 1431655766U)
+ true();
+}
+
+The same transformation can work with an even modulo with the addition of a
+rotate: rotate the result of the multiply to the right by the number of bits
+which need to be zero for the condition to be true, and shrink the compare RHS
+by the same amount. Unless the target supports rotates, though, that
+transformation probably isn't worthwhile.
+
+The transformation can also easily be made to work with non-zero equality
+comparisons: just transform, for example, "n % 3 == 1" to "(n-1) % 3 == 0".
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-Instcombine will merge comparisons like (x >= 10) && (x < 20) by producing (x -
-10) u< 10, but only when the comparisons have matching sign.
-
-This could be converted with a similiar technique. (PR1941)
-
-define i1 @test(i8 %x) {
- %A = icmp uge i8 %x, 5
- %B = icmp slt i8 %x, 20
- %C = and i1 %A, %B
- ret i1 %C
-}
-
-//===---------------------------------------------------------------------===//
-
These functions perform the same computation, but produce different assembly.
define i8 @select(i8 %x) readnone nounwind {
//===---------------------------------------------------------------------===//
-From GCC Bug 15241:
-unsigned int
-foo (unsigned int a, unsigned int b)
-{
- if (a <= 7 && b <= 7)
- baz ();
-}
-Should combine to "(a|b) <= 7". Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
-
-//===---------------------------------------------------------------------===//
-
From GCC Bug 3756:
int
pn (int n)
//===---------------------------------------------------------------------===//
-From GCC Bug 28685:
-int test(int a, int b)
-{
- int lt = a < b;
- int eq = a == b;
-
- return (lt || eq);
-}
-Should combine to "a <= b". Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts | llc".
-
-//===---------------------------------------------------------------------===//
-
void a(int variable)
{
if (variable == 4 || variable == 6)
//===---------------------------------------------------------------------===//
-int a(unsigned char* b) {return *b > 99;}
-There's an unnecessary zext in the generated code with "clang
--emit-llvm-bc | opt -std-compile-opts".
-
-//===---------------------------------------------------------------------===//
-
int a(unsigned b) {return ((b << 31) | (b << 30)) >> 31;}
Should be combined to "((b >> 1) | b) & 1". Currently not optimized
with "clang -emit-llvm-bc | opt -std-compile-opts".
//===---------------------------------------------------------------------===//
-unsigned a(unsigned a) {return ((a | 1) & 3) | (a & -4);}
-Should combine to "a | 1". Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
-
-//===---------------------------------------------------------------------===//
-
int a(int a, int b, int c) {return (~a & c) | ((c|a) & b);}
Should fold to "(~a & c) | (a & b)". Currently not optimized with
"clang -emit-llvm-bc | opt -std-compile-opts".
//===---------------------------------------------------------------------===//
+[ALIAS ANALYSIS]
+
Type based alias analysis:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14705
+We should do better analysis of posix_memalign. At the least it should
+no-capture its pointer argument; at best, we should know that the out-value
+result doesn't point to anything (like malloc). One example of this is in
+SingleSource/Benchmarks/Misc/dt.c
+
//===---------------------------------------------------------------------===//
A/B get pinned to the stack because we turn an if/then into a select instead
would delete the or instruction for us.
//===---------------------------------------------------------------------===//
+
+functionattrs doesn't know much about memcpy/memset. This function should be
+marked readnone rather than readonly, since it only twiddles local memory, but
+functionattrs doesn't handle memset/memcpy/memmove aggressively:
+
+struct X { int *p; int *q; };
+int foo() {
+ int i = 0, j = 1;
+ struct X x, y;
+ int **p;
+ y.p = &i;
+ x.q = &j;
+ p = __builtin_memcpy (&x, &y, sizeof (int *));
+ return **p;
+}
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine transformation:
+define i1 @a(i32 %x) nounwind readnone {
+entry:
+ %cmp = icmp eq i32 %x, 30
+ %sub = add i32 %x, -30
+ %cmp2 = icmp ugt i32 %sub, 9
+ %or = or i1 %cmp, %cmp2
+ ret i1 %or
+}
+This should be optimized to a single compare. Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine transformation:
+void b();
+void a(int x) { if (((1<<x)&8)==0) b(); }
+
+The shift should be optimized out. Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine or reassociate transformation:
+int a(int a, int b) { return (a==12)&(b>47)&(b<58); }
+
+The sgt and slt should be combined into a single comparison. Testcase derived
+from gcc.
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine transformation:
+define i32 @a(i32 %x) nounwind readnone {
+entry:
+ %rem = srem i32 %x, 32
+ %shl = shl i32 1, %rem
+ ret i32 %shl
+}
+
+The srem can be transformed to an and because if x is negative, the shift is
+undefined. Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine/dagcombine transformation:
+define i32 @a(i32 %x, i32 %y) nounwind readnone {
+entry:
+ %mul = mul i32 %y, -8
+ %sub = sub i32 %x, %mul
+ ret i32 %sub
+}
+
+Should compile to something like x+y*8, but currently compiles to an
+inefficient result. Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine/dagcombine transformation:
+define void @lshift_lt(i8 zeroext %a) nounwind {
+entry:
+ %conv = zext i8 %a to i32
+ %shl = shl i32 %conv, 3
+ %cmp = icmp ult i32 %shl, 33
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ tail call void @bar() nounwind
+ ret void
+
+if.end:
+ ret void
+}
+declare void @bar() nounwind
+
+The shift should be eliminated. Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+These compile into different code, one gets recognized as a switch and the
+other doesn't due to phase ordering issues (PR6212):
+
+int test1(int mainType, int subType) {
+ if (mainType == 7)
+ subType = 4;
+ else if (mainType == 9)
+ subType = 6;
+ else if (mainType == 11)
+ subType = 9;
+ return subType;
+}
+
+int test2(int mainType, int subType) {
+ if (mainType == 7)
+ subType = 4;
+ if (mainType == 9)
+ subType = 6;
+ if (mainType == 11)
+ subType = 9;
+ return subType;
+}
+
+//===---------------------------------------------------------------------===//
+
+The following test case (from PR6576):
+
+define i32 @mul(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %cond1 = icmp eq i32 %b, 0 ; <i1> [#uses=1]
+ br i1 %cond1, label %exit, label %bb.nph
+bb.nph: ; preds = %entry
+ %tmp = mul i32 %b, %a ; <i32> [#uses=1]
+ ret i32 %tmp
+exit: ; preds = %entry
+ ret i32 0
+}
+
+could be reduced to:
+
+define i32 @mul(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %tmp = mul i32 %b, %a
+ ret i32 %tmp
+}
+
+//===---------------------------------------------------------------------===//
+