We should add an FRINT node to the DAG to model targets that have legal
implementations of ceil/floor/rint.
+
+//===---------------------------------------------------------------------===//
+
+This GCC bug: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34043
+contains a testcase that compiles down to:
+
+ %struct.XMM128 = type { <4 x float> }
+..
+ %src = alloca %struct.XMM128
+..
+ %tmp6263 = bitcast %struct.XMM128* %src to <2 x i64>*
+ %tmp65 = getelementptr %struct.XMM128* %src, i32 0, i32 0
+ store <2 x i64> %tmp5899, <2 x i64>* %tmp6263, align 16
+ %tmp66 = load <4 x float>* %tmp65, align 16
+ %tmp71 = add <4 x float> %tmp66, %tmp66
+
+If the mid-level optimizer turned the bitcast of pointer + store of tmp5899
+into a bitcast of the vector value and a store to the pointer, then the
+store->load could be easily removed.
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+
+int test() {
+ long long input[8] = {1,1,1,1,1,1,1,1};
+ foo(input);
+}
+
+We currently compile this into a memcpy from a global array since the
+initializer is fairly large and not memset'able. This is good, but the memcpy
+gets lowered to load/stores in the code generator. This is also ok, except
+that the codegen lowering for memcpy doesn't handle the case when the source
+is a constant global. This gives us atrocious code like this:
+
+ call "L1$pb"
+"L1$pb":
+ popl %eax
+ movl _C.0.1444-"L1$pb"+32(%eax), %ecx
+ movl %ecx, 40(%esp)
+ movl _C.0.1444-"L1$pb"+20(%eax), %ecx
+ movl %ecx, 28(%esp)
+ movl _C.0.1444-"L1$pb"+36(%eax), %ecx
+ movl %ecx, 44(%esp)
+ movl _C.0.1444-"L1$pb"+44(%eax), %ecx
+ movl %ecx, 52(%esp)
+ movl _C.0.1444-"L1$pb"+40(%eax), %ecx
+ movl %ecx, 48(%esp)
+ movl _C.0.1444-"L1$pb"+12(%eax), %ecx
+ movl %ecx, 20(%esp)
+ movl _C.0.1444-"L1$pb"+4(%eax), %ecx
+...
+
+instead of:
+ movl $1, 16(%esp)
+ movl $0, 20(%esp)
+ movl $1, 24(%esp)
+ movl $0, 28(%esp)
+ movl $1, 32(%esp)
+ movl $0, 36(%esp)
+ ...
+
+//===---------------------------------------------------------------------===//
+
+http://llvm.org/PR717:
+
+The following code should compile into "ret int undef". Instead, LLVM
+produces "ret int 0":
+
+int f() {
+ int x = 4;
+ int y;
+ if (x == 3) y = 0;
+ return y;
+}
+
+//===---------------------------------------------------------------------===//
+
+The loop unroller should partially unroll loops (instead of peeling them)
+when code growth isn't too bad and when an unroll count allows simplification
+of some code within the loop. One trivial example is:
+
+#include <stdio.h>
+int main() {
+ int nRet = 17;
+ int nLoop;
+ for ( nLoop = 0; nLoop < 1000; nLoop++ ) {
+ if ( nLoop & 1 )
+ nRet += 2;
+ else
+ nRet -= 1;
+ }
+ return nRet;
+}
+
+Unrolling by 2 would eliminate the '&1' in both copies, leading to a net
+reduction in code size. The resultant code would then also be suitable for
+exit value computation.
+
+//===---------------------------------------------------------------------===//
+
+We miss a bunch of rotate opportunities on various targets, including ppc, x86,
+etc. On X86, we miss a bunch of 'rotate by variable' cases because the rotate
+matching code in dag combine doesn't look through truncates aggressively
+enough. Here are some testcases reduces from GCC PR17886:
+
+unsigned long long f(unsigned long long x, int y) {
+ return (x << y) | (x >> 64-y);
+}
+unsigned f2(unsigned x, int y){
+ return (x << y) | (x >> 32-y);
+}
+unsigned long long f3(unsigned long long x){
+ int y = 9;
+ return (x << y) | (x >> 64-y);
+}
+unsigned f4(unsigned x){
+ int y = 10;
+ return (x << y) | (x >> 32-y);
+}
+unsigned long long f5(unsigned long long x, unsigned long long y) {
+ return (x << 8) | ((y >> 48) & 0xffull);
+}
+unsigned long long f6(unsigned long long x, unsigned long long y, int z) {
+ switch(z) {
+ case 1:
+ return (x << 8) | ((y >> 48) & 0xffull);
+ case 2:
+ return (x << 16) | ((y >> 40) & 0xffffull);
+ case 3:
+ return (x << 24) | ((y >> 32) & 0xffffffull);
+ case 4:
+ return (x << 32) | ((y >> 24) & 0xffffffffull);
+ default:
+ return (x << 40) | ((y >> 16) & 0xffffffffffull);
+ }
+}
+
+On X86-64, we only handle f3/f4 right. On x86-32, several of these
+generate truly horrible code, instead of using shld and friends. On
+ARM, we end up with calls to L___lshrdi3/L___ashldi3 in f, which is
+badness. PPC64 misses f, f5 and f6. CellSPU aborts in isel.
+
+//===---------------------------------------------------------------------===//
+
+We do a number of simplifications in simplify libcalls to strength reduce
+standard library functions, but we don't currently merge them together. For
+example, it is useful to merge memcpy(a,b,strlen(b)) -> strcpy. This can only
+be done safely if "b" isn't modified between the strlen and memcpy of course.
+
+//===---------------------------------------------------------------------===//
+
+define i32 @test2(float %X, float %Y) {
+entry:
+ %tmp3 = fcmp uno float %X, %Y ; <i1> [#uses=1]
+ %tmp34 = zext i1 %tmp3 to i8 ; <i8> [#uses=1]
+ %tmp = xor i8 %tmp34, 1 ; <i8> [#uses=1]
+ %toBoolnot5 = zext i8 %tmp to i32 ; <i32> [#uses=1]
+ ret i32 %toBoolnot5
+}
+
+could be optimized further. Instcombine should use its bitwise analysis to
+collapse the zext/xor/zext structure to an xor/zext and then remove the
+xor by reversing the fcmp.
+
+Desired output:
+
+define i32 @test2(float %X, float %Y) {
+entry:
+ %tmp3 = fcmp ord float %X, %Y ; <i1> [#uses=1]
+ %tmp34 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
+ ret i32 %tmp34
+}
+
+To fix this, we need to make CanEvaluateInDifferentType smarter.
+
+//===---------------------------------------------------------------------===//
+
+We should be able to evaluate this loop:
+
+int test(int x_offs) {
+ while (x_offs > 4)
+ x_offs -= 4;
+ return x_offs;
+}
+
+//===---------------------------------------------------------------------===//