* Add support for compiling functions in both ARM and Thumb mode, then taking
  the smallest.

* Add support for compiling individual basic blocks in Thumb mode, when in a
  larger ARM function.  This can be used for presumed cold code, like paths
  to abort (failure path of asserts), EH handling code, etc.

//===---------------------------------------------------------------------===//

LPCRELL0:
        mov r1, #PCRELV0
        add r1, pc
        ldr r0, [r0, r1]
        mov pc, r0
        .align 2
LJTI1_0_0:
        .long LBB1_3
        ...

We should be able to generate:

LPCRELL0:
        add r1, LJTI1_0_0
        ldr r0, [r0, r1]
        mov pc, r0
        .align 2
LJTI1_0_0:
        .long LBB1_3
        ...

//===---------------------------------------------------------------------===//

We compile the following:

define i16 @func_entry_2E_ce(i32 %i) {
        switch i32 %i, label %bb12.exitStub [
                 i32 0, label %bb4.exitStub
                 i32 1, label %bb9.exitStub
                 i32 2, label %bb4.exitStub
                 i32 3, label %bb4.exitStub
                 i32 7, label %bb9.exitStub
                 i32 8, label %bb.exitStub
                 i32 9, label %bb9.exitStub
        ]

bb12.exitStub:
        ret i16 0

bb4.exitStub:
        ret i16 1

bb9.exitStub:
        ret i16 2

bb.exitStub:
        ret i16 3
}

into:

_func_entry_2E_ce:
        mov r2, #1
        lsl r2, r0
        cmp r0, #9
        bhi LBB1_4      @bb12.exitStub
LBB1_1: @newFuncRoot
        mov r1, #13
        tst r2, r1
        bne LBB1_5      @bb4.exitStub
LBB1_2: @newFuncRoot
        ldr r1, LCPI1_0
        tst r2, r1
        bne LBB1_6      @bb9.exitStub
LBB1_3: @newFuncRoot
        mov r1, #1
        lsl r1, r1, #8
        tst r2, r1
        bne LBB1_7      @bb.exitStub
LBB1_4: @bb12.exitStub
        mov r0, #0
        bx lr
LBB1_5: @bb4.exitStub
        mov r0, #1
        bx lr
LBB1_6: @bb9.exitStub
        mov r0, #2
        bx lr
LBB1_7: @bb.exitStub
        mov r0, #3
        bx lr
LBB1_8:
        .align 2
LCPI1_0:
        .long 642

gcc compiles to:

        cmp r0, #9
        ...
L12:
        .align 2
L11:
        .long 642

GCC is doing a couple of clever things here:
  1. It is predicating one of the returns.  This isn't a clear win though: in
     cases where that return isn't taken, it is replacing one condbranch with
     two 'ne' predicated instructions.
  2. It is sinking the shift of "1 << i" into the tst, and using ands instead
     of tst.  This will probably require whole-function isel.
  3. GCC emits:
        tst r1, #256
     we emit:
        mov r1, #1
        lsl r1, r1, #8
        tst r2, r1

//===---------------------------------------------------------------------===//

This is especially bad when dynamic alloca is used.  All fixed-size stack
objects are referenced off the frame pointer with negative offsets.  See
oggenc for an example.
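For illustration, a hypothetical C function in the same shape (a sketch, not
taken from oggenc): once the frame contains a dynamic alloca, sp moves by a
runtime-dependent amount, so the fixed-size objects can only be addressed
relative to the frame pointer.

#include <alloca.h>
#include <string.h>

int f(int n) {
  int buf[16];                        /* fixed-size: fp-relative access */
  int *dyn = alloca(n * sizeof(int)); /* dynamic: adjusts sp at runtime */
  memset(buf, 0, sizeof(buf));
  for (int i = 0; i < n; ++i)
    dyn[i] = buf[i & 15];
  return dyn[0];
}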
//===---------------------------------------------------------------------===//

Poor codegen for test/CodeGen/ARM/select.ll f7:

        ldr r5, LCPI1_0
LPC0:
        add r5, pc
        ldr r6, LCPI1_1
        ldr r2, LCPI1_2
        mov r3, r6
        mov lr, pc
        bx r5

//===---------------------------------------------------------------------===//

Make the register allocator / spiller smarter so we can re-materialize
"mov r, imm", etc.  Almost all Thumb instructions clobber the condition
codes, which makes this tricky.

//===---------------------------------------------------------------------===//

Add ldmia, stmia support.

//===---------------------------------------------------------------------===//

Thumb load / store address mode offsets are scaled.  The values kept in the
instruction operands are pre-scale values.  This probably ought to be changed
to avoid extra work when we convert Thumb2 instructions to Thumb1 instructions.

//===---------------------------------------------------------------------===//

We need to make (some of the) Thumb1 instructions predicable.  That will allow
shrinking of predicated Thumb2 instructions.  To allow this, we need to be able
to toggle the 's' bit, since these instructions do not set CPSR when they are
inside IT blocks.

//===---------------------------------------------------------------------===//

Make use of hi register variants of cmp: tCMPhir / tCMPZhir.

//===---------------------------------------------------------------------===//

Thumb1 immediate fields sometimes keep pre-scaled values.  See
Thumb1RegisterInfo::eliminateFrameIndex.  This is inconsistent with ARM and
Thumb2.

//===---------------------------------------------------------------------===//

Rather than having tBR_JTr print a ".align 2" and having the constant island
pass pad it, add a target-specific ALIGN instruction instead.  That way,
GetInstSizeInBytes won't have to over-estimate.  It can also be used for a
loop alignment pass.
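To make the size-estimate point concrete, a small sketch in plain C (not the
LLVM API): the padding a ".align" emits depends on the offset it lands at,
which isn't known per-instruction, so GetInstSizeInBytes has to assume the
worst case for tBR_JTr, whereas an explicit ALIGN instruction would carry that
padding as its own, separately accounted-for size.

#include <stdio.h>

/* Bytes of padding needed to align `offset` up to `align` bytes. */
static unsigned pad_at(unsigned offset, unsigned align) {
  return (align - offset % align) % align;
}

int main(void) {
  /* After a run of 2-byte Thumb instructions, an .align 2 (4-byte) pad is
     0 or 2 bytes depending on layout; a fixed estimate must assume 2. */
  printf("%u %u\n", pad_at(100, 4), pad_at(102, 4)); /* prints: 0 2 */
  return 0;
}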