+We could also strength-reduce the rem and the div:
+http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
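+
+A minimal sketch of the technique, assuming an unsigned 32-bit divide by a
+known constant (10 here; div10/rem10 are illustrative names, and other
+divisors need the magic-number derivation from the paper above):
+
+#include <stdint.h>
+
+uint32_t div10(uint32_t x) {
+  /* 0xCCCCCCCD == ceil(2^35 / 10); the multiply-high plus shift is exact
+     for every 32-bit x, so no divide instruction is needed. */
+  return (uint32_t)(((uint64_t)x * 0xCCCCCCCDu) >> 35);
+}
+
+uint32_t rem10(uint32_t x) {
+  /* The rem falls out of the strength-reduced div for free. */
+  return x - div10(x) * 10;
+}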
+
+===-------------------------------------------------------------------------===
+
+float foo(float X) { return (int)(X); }
+
+Currently produces:
+
+_foo:
+ fctiwz f0, f1
+ stfd f0, -8(r1)
+ lwz r2, -4(r1)
+ extsw r2, r2
+ std r2, -16(r1)
+ lfd f0, -16(r1)
+ fcfid f0, f0
+ frsp f1, f0
+ blr
+
+We could use a target DAG combine to turn the lwz/extsw pair into an lwa when
+the lwz has a single use (i.e. a single "lwa r2, -4(r1)" here). Since lwa is
+cracked anyway, this would be a code-size win only.
+
+===-------------------------------------------------------------------------===
+
+We generate ugly code for this:
+
+void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
+ unsigned code = 0;
+ if(dx < -dw) code |= 1;
+ if(dx > dw) code |= 2;
+ if(dy < -dw) code |= 4;
+ if(dy > dw) code |= 8;
+ if(dz < -dw) code |= 16;
+ if(dz > dw) code |= 32;
+ *ret = code;
+}
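+
+A branch-free shape we would presumably prefer (a hypothetical rewrite, with
+func2 as an illustrative name): each compare materializes a 0/1 bit that is
+shifted into place, with no control flow at all.
+
+void func2(unsigned int *ret, float dx, float dy, float dz, float dw) {
+  unsigned code = 0;
+  code |= (unsigned)(dx < -dw) << 0;
+  code |= (unsigned)(dx >  dw) << 1;
+  code |= (unsigned)(dy < -dw) << 2;
+  code |= (unsigned)(dy >  dw) << 3;
+  code |= (unsigned)(dz < -dw) << 4;
+  code |= (unsigned)(dz >  dw) << 5;
+  *ret = code;
+}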
+
+===-------------------------------------------------------------------------===
+
+Complete the "signed i32 to FP conversion using 64-bit registers"
+transformation, which is good for PI. See PPCISelLowering.cpp, this comment:
+
+ // FIXME: disable this lowered code. This generates 64-bit register values,
+ // and we don't model the fact that the top part is clobbered by calls. We
+ // need to flag these together so that the value isn't live across a call.
+ //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+
+Also, if the registers are spilled to the stack, we have to ensure that all
+64 bits of them are saved/restored, otherwise we will miscompile the code. It
+sounds like we need to get the 64-bit register classes going.
+
+===-------------------------------------------------------------------------===
+
+%struct.B = type { i8, [3 x i8] }
+
+define void @bar(%struct.B* %b) {
+entry:
+  %tmp = bitcast %struct.B* %b to i32*            ; <i32*> [#uses=1]
+  %tmp2 = load i32* %tmp                          ; <i32> [#uses=1]
+  %tmp3 = bitcast %struct.B* %b to i32*           ; <i32*> [#uses=1]
+  %tmp4 = load i32* %tmp3                         ; <i32> [#uses=1]
+  %tmp8 = bitcast %struct.B* %b to i32*           ; <i32*> [#uses=2]
+  %tmp9 = load i32* %tmp8                         ; <i32> [#uses=1]
+  %tmp4.mask17 = shl i32 %tmp4, 1                 ; <i32> [#uses=1]
+  %tmp1415 = and i32 %tmp4.mask17, 2147483648     ; <i32> [#uses=1]
+  %tmp.masked = and i32 %tmp2, 2147483648         ; <i32> [#uses=1]
+ %tmp11 = or i32 %tmp1415, %tmp.masked ; <uint> [#uses=1]
+ %tmp12 = and i32 %tmp9, 2147483647 ; <uint> [#uses=1]
+ %tmp13 = or i32 %tmp12, %tmp11 ; <uint> [#uses=1]
+ store i32 %tmp13, i32* %tmp8
+ ret void
+}
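+
+For reference, a C function with roughly the same semantics as the IR above
+(a sketch, not the original source; bar_equiv is an illustrative name): fold
+bits 30 and 31 of the word into bit 31 and leave the rest alone.
+
+void bar_equiv(unsigned *b) {
+  unsigned x = *b;
+  unsigned top = (x | (x << 1)) & 0x80000000u;  /* bit31(x) | bit30(x) */
+  *b = (x & 0x7fffffffu) | top;
+}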
+
+We emit:
+
+_bar:
+ lwz r2, 0(r3)
+ slwi r4, r2, 1
+ or r4, r4, r2
+ rlwimi r2, r4, 0, 0, 0
+ stw r2, 0(r3)
+ blr
+
+We could collapse a bunch of those ORs and ANDs and generate the following
+equivalent code:
+
+_bar:
+ lwz r2, 0(r3)
+ rlwinm r4, r2, 1, 0, 0
+ or r2, r2, r4
+ stw r2, 0(r3)
+ blr
+
+===-------------------------------------------------------------------------===
+
+We compile:
+
+unsigned test6(unsigned x) {
+ return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
+}
+
+into:
+
+_test6:
+ lis r2, 255
+ rlwinm r3, r3, 16, 0, 31
+ ori r2, r2, 255
+ and r3, r3, r2
+ blr
+
+GCC gets it down to two rotate-and-mask instructions, avoiding materializing
+the 0x00FF00FF mask:
+
+_test6:
+ rlwinm r0,r3,16,8,15
+ rlwinm r3,r3,16,24,31
+ or r3,r3,r0
+ blr
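+
+Both versions compute the same thing; in C terms (a sketch with an
+illustrative name), a rotate by 16 followed by one mask keeping bytes 0 and 2:
+
+unsigned test6_equiv(unsigned x) {
+  unsigned r = (x << 16) | (x >> 16);  /* rotlwi 16 */
+  return r & 0x00FF00FFu;              /* keep bytes 0 and 2 */
+}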
+
+
+===-------------------------------------------------------------------------===
+
+Consider a function like this:
+
+float foo(float X) { return X + 1234.4123f; }
+
+The FP constant ends up in the constant pool, so we need the LR register to
+form the PIC base address. This ends up producing code like this:
+
+_foo:
+.LBB_foo_0: ; entry
+ mflr r11
+*** stw r11, 8(r1)
+ bl "L00000$pb"
+"L00000$pb":
+ mflr r2
+ addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
+ lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
+ fadds f1, f1, f0
+*** lwz r11, 8(r1)
+ mtlr r11
+ blr
+
+This is functional, but there is no reason to spill the LR register all the way
+to the stack (the two marked instrs): spilling it to a GPR is quite enough.
+
+Implementing this will require some codegen improvements. Nate writes:
+
+"So basically what we need to support the "no stack frame save and restore" is a
+generalization of the LR optimization to "callee-save regs".
+
+Currently, we have LR marked as a callee-save reg. The register allocator sees
+that it's callee save, and spills it directly to the stack.
+
+Ideally, something like this would happen:
+
+LR would be in a separate register class from the GPRs. The class of LR would be
+marked "unspillable". When the register allocator came across an unspillable
+reg, it would ask "what is the best class to copy this into that I *can* spill"
+If it gets a class back, which it will in this case (the gprs), it grabs a free
+register of that class. If it is then later necessary to spill that reg, so be
+it.
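+
+Concretely, for the example above we would like to drop the two marked
+instructions and keep LR live in a GPR instead (a sketch, assuming r11 stays
+free across the sequence):
+
+_foo:
+.LBB_foo_0: ; entry
+ mflr r11 ; LR saved in a GPR, not on the stack
+ bl "L00000$pb"
+"L00000$pb":
+ mflr r2
+ addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
+ lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
+ fadds f1, f1, f0
+ mtlr r11 ; restored without touching memory
+ blr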
+
+===-------------------------------------------------------------------------===
+
+We compile this:
+int test(_Bool X) {
+ return X ? 524288 : 0;
+}
+
+to:
+_test:
+ cmplwi cr0, r3, 0
+ lis r2, 8
+ li r3, 0
+ beq cr0, LBB1_2 ;entry
+LBB1_1: ;entry
+ mr r3, r2
+LBB1_2: ;entry
+ blr
+
+instead of:
+_test:
+ addic r2,r3,-1
+ subfe r0,r2,r3
+ slwi r3,r0,19
+ blr
+
+This sort of thing occurs a lot due to globalopt.
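+
+The shorter sequence works because addic/subfe materialize X != 0 as 0 or 1
+without a branch (and a _Bool is already 0 or 1); in C terms (a sketch,
+illustrative name):
+
+int test_equiv(_Bool X) {
+  return (int)((unsigned)X << 19);  /* 524288 == 1 << 19 */
+}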
+
+===-------------------------------------------------------------------------===
+
+We compile:
+
+define i32 @bar(i32 %x) nounwind readnone ssp {
+entry:
+ %0 = icmp eq i32 %x, 0 ; <i1> [#uses=1]
+ %neg = sext i1 %0 to i32 ; <i32> [#uses=1]
+ ret i32 %neg
+}
+
+to:
+
+_bar:
+ cntlzw r2, r3
+ slwi r2, r2, 26
+ srawi r3, r2, 31
+ blr
+
+it would be better to produce:
+
+_bar:
+ addic r3,r3,-1
+ subfe r3,r3,r3
+ blr
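+
+The addic/subfe idiom works because addic r3,r3,-1 sets the carry bit iff x
+is nonzero, and subfe r3,r3,r3 yields carry - 1, i.e. 0 or -1. In C terms
+(a sketch, illustrative name):
+
+int bar2(int x) {
+  return -(x == 0);  /* x == 0 -> -1, otherwise 0 */
+}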
+
+===-------------------------------------------------------------------------===
+
+We currently compile 32-bit bswap:
+
+declare i32 @llvm.bswap.i32(i32 %A)
+define i32 @test(i32 %A) {
+ %B = call i32 @llvm.bswap.i32(i32 %A)
+ ret i32 %B
+}
+
+to:
+
+_test:
+ rlwinm r2, r3, 24, 16, 23
+ slwi r4, r3, 24
+ rlwimi r2, r3, 8, 24, 31
+ rlwimi r4, r3, 8, 8, 15
+ rlwimi r4, r2, 0, 16, 31
+ mr r3, r4
+ blr
+
+it would be more efficient to produce:
+
+_test: mr r0,r3
+ rlwinm r3,r3,8,0xffffffff
+ rlwimi r3,r0,24,0,7
+ rlwimi r3,r0,24,16,23
+ blr
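+
+The shorter sequence is the classic rotate-and-insert decomposition of a
+32-bit byte swap: one rotate by 8 puts two of the four bytes in place, and
+two inserts from a rotate by 24 fix the other two. A C sketch of the same
+decomposition (illustrative name; byte 0 is the most significant byte):
+
+#include <stdint.h>
+
+uint32_t bswap_equiv(uint32_t x) {
+  uint32_t r24 = (x << 24) | (x >> 8);           /* rotl(x, 24) */
+  uint32_t r   = (x << 8)  | (x >> 24);          /* rotl(x, 8): bytes 1, 3 done */
+  r = (r & 0x00FFFFFFu) | (r24 & 0xFF000000u);   /* rlwimi: fix byte 0 */
+  r = (r & 0xFFFF00FFu) | (r24 & 0x0000FF00u);   /* rlwimi: fix byte 2 */
+  return r;
+}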
+
+===-------------------------------------------------------------------------===
+
+test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:
+
+__ZNK4llvm5APInt17countLeadingZerosEv:
+ ld r2, 0(r3)
+ cntlzd r2, r2
+ or r2, r2, r2 <<-- silly.
+ addi r3, r2, -64
+ blr
+
+The dead or is a 'truncate' from 64 to 32 bits.
+
+===-------------------------------------------------------------------------===
+
+We generate horrible ppc code for this:
+
+#define N 2000000
+double a[N],c[N];
+void simpleloop() {
+ int j;
+ for (j=0; j<N; j++)
+ c[j] = a[j];
+}
+
+LBB1_1: ;bb
+ lfdx f0, r3, r4
+ addi r5, r5, 1 ;; Extra IV for the exit value compare.
+ stfdx f0, r2, r4
+ addi r4, r4, 8