From 0521e3cdc146cdd37768dc44e9069862805f208d Mon Sep 17 00:00:00 2001
From: Chris Lattner
Date: Wed, 18 Jun 2008 04:33:20 +0000
Subject: [PATCH] implement some simple bswap optimizations, rdar://5992453

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@52442 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Scalar/InstructionCombining.cpp       | 290 ++++++++++--------
 test/Transforms/InstCombine/bswap-fold.ll |  31 +-
 2 files changed, 195 insertions(+), 126 deletions(-)

diff --git a/lib/Transforms/Scalar/InstructionCombining.cpp b/lib/Transforms/Scalar/InstructionCombining.cpp
index 4af95454939..802102c99a2 100644
--- a/lib/Transforms/Scalar/InstructionCombining.cpp
+++ b/lib/Transforms/Scalar/InstructionCombining.cpp
@@ -1304,6 +1304,46 @@ bool InstCombiner::SimplifyDemandedBits(Value *V, APInt DemandedMask,
     KnownZero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
     break;
   }
+  case Instruction::Call:
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+      switch (II->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::bswap: {
+        // If the only bits demanded come from one byte of the bswap result,
+        // just shift the input byte into position to eliminate the bswap.
+        unsigned NLZ = DemandedMask.countLeadingZeros();
+        unsigned NTZ = DemandedMask.countTrailingZeros();
+
+        // Round NTZ down to the next byte.  If we have 11 trailing zeros, then
+        // we need all the bits down to bit 8.  Likewise, round NLZ.  If we
+        // have 14 leading zeros, round to 8.
+        NLZ &= ~7;
+        NTZ &= ~7;
+        // If we need exactly one byte, we can do this transformation.
+        if (BitWidth-NLZ-NTZ == 8) {
+          unsigned ResultBit = NTZ;
+          unsigned InputBit = BitWidth-NTZ-8;
+
+          // Replace this with either a left or right shift to get the byte into
+          // the right place.
+          Instruction *NewVal;
+          if (InputBit > ResultBit)
+            NewVal = BinaryOperator::CreateLShr(I->getOperand(1),
+                      ConstantInt::get(I->getType(), InputBit-ResultBit));
+          else
+            NewVal = BinaryOperator::CreateShl(I->getOperand(1),
+                      ConstantInt::get(I->getType(), ResultBit-InputBit));
+          NewVal->takeName(I);
+          InsertNewInstBefore(NewVal, *I);
+          return UpdateValueUsesWith(I, NewVal);
+        }
+
+        // TODO: Could compute known zero/one bits based on the input.
+        break;
+      }
+      }
+    }
+    break;
   }
 
   // If the client is only demanding bits that we know, return the known
@@ -8533,144 +8573,150 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     }
 
     if (Changed) return II;
-  } else {
-    switch (II->getIntrinsicID()) {
-    default: break;
-    case Intrinsic::ppc_altivec_lvx:
-    case Intrinsic::ppc_altivec_lvxl:
-    case Intrinsic::x86_sse_loadu_ps:
-    case Intrinsic::x86_sse2_loadu_pd:
-    case Intrinsic::x86_sse2_loadu_dq:
-      // Turn PPC lvx -> load if the pointer is known aligned.
-      // Turn X86 loadups -> load if the pointer is known aligned.
-      if (GetOrEnforceKnownAlignment(II->getOperand(1), 16) >= 16) {
-        Value *Ptr = InsertBitCastBefore(II->getOperand(1),
-                                         PointerType::getUnqual(II->getType()),
-                                         CI);
-        return new LoadInst(Ptr);
-      }
-      break;
-    case Intrinsic::ppc_altivec_stvx:
-    case Intrinsic::ppc_altivec_stvxl:
-      // Turn stvx -> store if the pointer is known aligned.
-      if (GetOrEnforceKnownAlignment(II->getOperand(2), 16) >= 16) {
-        const Type *OpPtrTy =
-          PointerType::getUnqual(II->getOperand(1)->getType());
-        Value *Ptr = InsertBitCastBefore(II->getOperand(2), OpPtrTy, CI);
-        return new StoreInst(II->getOperand(1), Ptr);
-      }
-      break;
-    case Intrinsic::x86_sse_storeu_ps:
-    case Intrinsic::x86_sse2_storeu_pd:
-    case Intrinsic::x86_sse2_storeu_dq:
-    case Intrinsic::x86_sse2_storel_dq:
-      // Turn X86 storeu -> store if the pointer is known aligned.
-      if (GetOrEnforceKnownAlignment(II->getOperand(1), 16) >= 16) {
-        const Type *OpPtrTy =
-          PointerType::getUnqual(II->getOperand(2)->getType());
-        Value *Ptr = InsertBitCastBefore(II->getOperand(1), OpPtrTy, CI);
-        return new StoreInst(II->getOperand(2), Ptr);
-      }
-      break;
+  }
+
+  switch (II->getIntrinsicID()) {
+  default: break;
+  case Intrinsic::bswap:
+    // bswap(bswap(x)) -> x
+    if (IntrinsicInst *Operand = dyn_cast<IntrinsicInst>(II->getOperand(1)))
+      if (Operand->getIntrinsicID() == Intrinsic::bswap)
+        return ReplaceInstUsesWith(CI, Operand->getOperand(1));
+    break;
+  case Intrinsic::ppc_altivec_lvx:
+  case Intrinsic::ppc_altivec_lvxl:
+  case Intrinsic::x86_sse_loadu_ps:
+  case Intrinsic::x86_sse2_loadu_pd:
+  case Intrinsic::x86_sse2_loadu_dq:
+    // Turn PPC lvx -> load if the pointer is known aligned.
+    // Turn X86 loadups -> load if the pointer is known aligned.
+    if (GetOrEnforceKnownAlignment(II->getOperand(1), 16) >= 16) {
+      Value *Ptr = InsertBitCastBefore(II->getOperand(1),
+                                       PointerType::getUnqual(II->getType()),
+                                       CI);
+      return new LoadInst(Ptr);
+    }
+    break;
+  case Intrinsic::ppc_altivec_stvx:
+  case Intrinsic::ppc_altivec_stvxl:
+    // Turn stvx -> store if the pointer is known aligned.
+    if (GetOrEnforceKnownAlignment(II->getOperand(2), 16) >= 16) {
+      const Type *OpPtrTy =
+        PointerType::getUnqual(II->getOperand(1)->getType());
+      Value *Ptr = InsertBitCastBefore(II->getOperand(2), OpPtrTy, CI);
+      return new StoreInst(II->getOperand(1), Ptr);
+    }
+    break;
+  case Intrinsic::x86_sse_storeu_ps:
+  case Intrinsic::x86_sse2_storeu_pd:
+  case Intrinsic::x86_sse2_storeu_dq:
+  case Intrinsic::x86_sse2_storel_dq:
+    // Turn X86 storeu -> store if the pointer is known aligned.
+    if (GetOrEnforceKnownAlignment(II->getOperand(1), 16) >= 16) {
+      const Type *OpPtrTy =
+        PointerType::getUnqual(II->getOperand(2)->getType());
+      Value *Ptr = InsertBitCastBefore(II->getOperand(1), OpPtrTy, CI);
+      return new StoreInst(II->getOperand(2), Ptr);
+    }
+    break;
+
+  case Intrinsic::x86_sse_cvttss2si: {
+    // These intrinsics only demands the 0th element of its input vector. If
+    // we can simplify the input based on that, do so now.
+    uint64_t UndefElts;
+    if (Value *V = SimplifyDemandedVectorElts(II->getOperand(1), 1,
+                                              UndefElts)) {
+      II->setOperand(1, V);
+      return II;
+    }
+    break;
+  }
+
+  case Intrinsic::ppc_altivec_vperm:
+    // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
+    if (ConstantVector *Mask = dyn_cast<ConstantVector>(II->getOperand(3))) {
+      assert(Mask->getNumOperands() == 16 && "Bad type for intrinsic!");
-    case Intrinsic::x86_sse_cvttss2si: {
-      // These intrinsics only demands the 0th element of its input vector. If
-      // we can simplify the input based on that, do so now.
-      uint64_t UndefElts;
-      if (Value *V = SimplifyDemandedVectorElts(II->getOperand(1), 1,
-                                                UndefElts)) {
-        II->setOperand(1, V);
-        return II;
+      // Check that all of the elements are integer constants or undefs.
+      bool AllEltsOk = true;
+      for (unsigned i = 0; i != 16; ++i) {
+        if (!isa<ConstantInt>(Mask->getOperand(i)) &&
+            !isa<UndefValue>(Mask->getOperand(i))) {
+          AllEltsOk = false;
+          break;
+        }
       }
-      break;
-    }
-    case Intrinsic::ppc_altivec_vperm:
-      // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
-      if (ConstantVector *Mask = dyn_cast<ConstantVector>(II->getOperand(3))) {
-        assert(Mask->getNumOperands() == 16 && "Bad type for intrinsic!");
+      if (AllEltsOk) {
+        // Cast the input vectors to byte vectors.
+        Value *Op0 =InsertBitCastBefore(II->getOperand(1),Mask->getType(),CI);
+        Value *Op1 =InsertBitCastBefore(II->getOperand(2),Mask->getType(),CI);
+        Value *Result = UndefValue::get(Op0->getType());
-        // Check that all of the elements are integer constants or undefs.
-        bool AllEltsOk = true;
-        for (unsigned i = 0; i != 16; ++i) {
-          if (!isa<ConstantInt>(Mask->getOperand(i)) &&
-              !isa<UndefValue>(Mask->getOperand(i))) {
-            AllEltsOk = false;
-            break;
-          }
-        }
+        // Only extract each element once.
+        Value *ExtractedElts[32];
+        memset(ExtractedElts, 0, sizeof(ExtractedElts));
-        if (AllEltsOk) {
-          // Cast the input vectors to byte vectors.
-          Value *Op0 =InsertBitCastBefore(II->getOperand(1),Mask->getType(),CI);
-          Value *Op1 =InsertBitCastBefore(II->getOperand(2),Mask->getType(),CI);
-          Value *Result = UndefValue::get(Op0->getType());
-
-          // Only extract each element once.
-          Value *ExtractedElts[32];
-          memset(ExtractedElts, 0, sizeof(ExtractedElts));
-
-          for (unsigned i = 0; i != 16; ++i) {
-            if (isa<UndefValue>(Mask->getOperand(i)))
-              continue;
-            unsigned Idx=cast<ConstantInt>(Mask->getOperand(i))->getZExtValue();
-            Idx &= 31;  // Match the hardware behavior.
-
-            if (ExtractedElts[Idx] == 0) {
-              Instruction *Elt =
-                new ExtractElementInst(Idx < 16 ? Op0 : Op1, Idx&15, "tmp");
-              InsertNewInstBefore(Elt, CI);
-              ExtractedElts[Idx] = Elt;
-            }
+        for (unsigned i = 0; i != 16; ++i) {
+          if (isa<UndefValue>(Mask->getOperand(i)))
+            continue;
+          unsigned Idx=cast<ConstantInt>(Mask->getOperand(i))->getZExtValue();
+          Idx &= 31;  // Match the hardware behavior.
-            // Insert this value into the result vector.
-            Result = InsertElementInst::Create(Result, ExtractedElts[Idx],
-                                               i, "tmp");
-            InsertNewInstBefore(cast<Instruction>(Result), CI);
+          if (ExtractedElts[Idx] == 0) {
+            Instruction *Elt =
+              new ExtractElementInst(Idx < 16 ? Op0 : Op1, Idx&15, "tmp");
+            InsertNewInstBefore(Elt, CI);
+            ExtractedElts[Idx] = Elt;
           }
-          return CastInst::Create(Instruction::BitCast, Result, CI.getType());
+
+          // Insert this value into the result vector.
+          Result = InsertElementInst::Create(Result, ExtractedElts[Idx],
+                                             i, "tmp");
+          InsertNewInstBefore(cast<Instruction>(Result), CI);
         }
+        return CastInst::Create(Instruction::BitCast, Result, CI.getType());
       }
-      break;
+    }
+    break;
-    case Intrinsic::stackrestore: {
-      // If the save is right next to the restore, remove the restore.  This can
-      // happen when variable allocas are DCE'd.
-      if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getOperand(1))) {
-        if (SS->getIntrinsicID() == Intrinsic::stacksave) {
-          BasicBlock::iterator BI = SS;
-          if (&*++BI == II)
-            return EraseInstFromFunction(CI);
-        }
+  case Intrinsic::stackrestore: {
+    // If the save is right next to the restore, remove the restore.  This can
+    // happen when variable allocas are DCE'd.
+    if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getOperand(1))) {
+      if (SS->getIntrinsicID() == Intrinsic::stacksave) {
+        BasicBlock::iterator BI = SS;
+        if (&*++BI == II)
+          return EraseInstFromFunction(CI);
       }
-
-      // Scan down this block to see if there is another stack restore in the
-      // same block without an intervening call/alloca.
-      BasicBlock::iterator BI = II;
-      TerminatorInst *TI = II->getParent()->getTerminator();
-      bool CannotRemove = false;
-      for (++BI; &*BI != TI; ++BI) {
-        if (isa<AllocaInst>(BI)) {
+    }
+
+    // Scan down this block to see if there is another stack restore in the
+    // same block without an intervening call/alloca.
+    BasicBlock::iterator BI = II;
+    TerminatorInst *TI = II->getParent()->getTerminator();
+    bool CannotRemove = false;
+    for (++BI; &*BI != TI; ++BI) {
+      if (isa<AllocaInst>(BI)) {
+        CannotRemove = true;
+        break;
+      }
+      if (isa<CallInst>(BI)) {
+        if (!isa<IntrinsicInst>(BI)) {
          CannotRemove = true;
          break;
        }
-        if (isa<CallInst>(BI)) {
-          if (!isa<IntrinsicInst>(BI)) {
-            CannotRemove = true;
-            break;
-          }
-          // If there is a stackrestore below this one, remove this one.
-          return EraseInstFromFunction(CI);
-        }
-      }
-
-      // If the stack restore is in a return/unwind block and if there are no
-      // allocas or calls between the restore and the return, nuke the restore.
-      if (!CannotRemove && (isa<ReturnInst>(TI) || isa<UnwindInst>(TI)))
+        // If there is a stackrestore below this one, remove this one.
        return EraseInstFromFunction(CI);
-      break;
-    }
+      }
    }
+
+    // If the stack restore is in a return/unwind block and if there are no
+    // allocas or calls between the restore and the return, nuke the restore.
+    if (!CannotRemove && (isa<ReturnInst>(TI) || isa<UnwindInst>(TI)))
+      return EraseInstFromFunction(CI);
+    break;
+  }
  }

  return visitCallSite(II);
diff --git a/test/Transforms/InstCombine/bswap-fold.ll b/test/Transforms/InstCombine/bswap-fold.ll
index 3d354a10216..87d8b0496d2 100644
--- a/test/Transforms/InstCombine/bswap-fold.ll
+++ b/test/Transforms/InstCombine/bswap-fold.ll
@@ -1,7 +1,5 @@
-; RUN: llvm-as < %s | opt -instcombine | llvm-dis | \
-; RUN: grep ret | count 3
-; RUN: llvm-as < %s | opt -instcombine | llvm-dis | \
-; RUN: not grep call.*bswap
+; RUN: llvm-as < %s | opt -instcombine | llvm-dis | grep ret | count 6
+; RUN: llvm-as < %s | opt -instcombine | llvm-dis | not grep call.*bswap
 
 define i1 @test1(i16 %tmp2) {
         %tmp10 = call i16 @llvm.bswap.i16( i16 %tmp2 )          ; <i16> [#uses=1]
@@ -27,3 +25,28 @@ declare i64 @llvm.bswap.i64(i64)
 
 declare i16 @llvm.bswap.i16(i16)
 
+; rdar://5992453
+; A & 255
+define i32 @test4(i32 %a) nounwind {
+entry:
+	%tmp2 = tail call i32 @llvm.bswap.i32( i32 %a )
+	%tmp4 = lshr i32 %tmp2, 24
+	ret i32 %tmp4
+}
+
+; A
+define i32 @test5(i32 %a) nounwind {
+entry:
+	%tmp2 = tail call i32 @llvm.bswap.i32( i32 %a )
+	%tmp4 = tail call i32 @llvm.bswap.i32( i32 %tmp2 )
+	ret i32 %tmp4
+}
+
+; a >> 24
+define i32 @test6(i32 %a) nounwind {
+entry:
+	%tmp2 = tail call i32 @llvm.bswap.i32( i32 %a )
+	%tmp4 = and i32 %tmp2, 255
+	ret i32 %tmp4
+}
+
-- 
2.34.1
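
As a quick cross-check of the SimplifyDemandedBits hunk above, the following
standalone C++ sketch mirrors the same byte-shift arithmetic outside of LLVM.
It is illustrative only: FoldBswapDemandedByte and the sample values are names
invented for this note, and it leans on the GCC/Clang builtins
__builtin_bswap32, __builtin_clz, and __builtin_ctz instead of APInt, so treat
it as a sketch of the transformation under those assumptions, not as code from
the patch.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Compute bswap(A) & DemandedMask without a bswap, the way the patch does:
// round the leading/trailing zero counts of the mask down to a byte boundary
// and shift the demanded input byte straight into position.
static uint32_t FoldBswapDemandedByte(uint32_t A, uint32_t DemandedMask) {
  const unsigned BitWidth = 32;
  unsigned NLZ = __builtin_clz(DemandedMask);   // mask is assumed nonzero
  unsigned NTZ = __builtin_ctz(DemandedMask);
  NLZ &= ~7u;
  NTZ &= ~7u;
  assert(BitWidth - NLZ - NTZ == 8 && "mask must demand exactly one byte");

  unsigned ResultBit = NTZ;                 // where the byte lands in the result
  unsigned InputBit  = BitWidth - NTZ - 8;  // where bswap reads it from
  uint32_t Shifted = InputBit > ResultBit ? A >> (InputBit - ResultBit)
                                          : A << (ResultBit - InputBit);
  return Shifted & DemandedMask;
}

int main() {
  for (uint32_t A : {0x12345678u, 0xDEADBEEFu, 0u, 0xFFFFFFFFu}) {
    for (unsigned Byte = 0; Byte != 4; ++Byte) {
      uint32_t Mask = 0xFFu << (8 * Byte);
      assert(FoldBswapDemandedByte(A, Mask) == (__builtin_bswap32(A) & Mask));
    }
  }
  std::puts("byte-shift rewrite matches bswap on every checked mask");
  return 0;
}

Compiled with a recent g++ or clang++ and run, the asserts should pass for
every single-byte mask; that equivalence is what the new test4 and test6 cases
in bswap-fold.ll rely on (bswap(a) >> 24 == a & 255 and bswap(a) & 255 ==
a >> 24 for an i32 value a).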