"computations were sunk");
STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads");
STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized");
+STATISTIC(NumAndsAdded,
+ "Number of and mask instructions added to form ext loads");
+STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
STATISTIC(NumRetsDup, "Number of return instructions duplicated");
STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
bool optimizeCallInst(CallInst *CI, bool& ModifiedDT);
bool moveExtToFormExtLoad(Instruction *&I);
bool optimizeExtUses(Instruction *I);
+ bool optimizeLoadExt(LoadInst *I);
bool optimizeSelectInst(SelectInst *SI);
bool optimizeShuffleVectorInst(ShuffleVectorInst *SI);
bool optimizeSwitchInst(SwitchInst *CI);
return MadeChange;
}
+// Find loads whose users demand only some of the loaded value's bits. Add an "and"
+// just after the load if the target can fold this into one extload instruction,
+// with the hope of eliminating some of the other later "and" instructions using
+// the loaded value. "and"s that are made trivially redundant by the insertion
+// of the new "and" are removed by this function, while others (e.g. those whose
+// path from the load goes through a phi) are left for isel to potentially
+// remove.
+//
+// For example:
+//
+// b0:
+// x = load i32
+// ...
+// b1:
+// y = and x, 0xff
+// z = use y
+//
+// becomes:
+//
+// b0:
+// x = load i32
+// x' = and x, 0xff
+// ...
+// b1:
+// z = use x'
+//
+// whereas:
+//
+// b0:
+// x1 = load i32
+// ...
+// b1:
+// x2 = load i32
+// ...
+// b2:
+// x = phi x1, x2
+// y = and x, 0xff
+//
+// becomes (after a call to optimizeLoadExt for each load):
+//
+// b0:
+// x1 = load i32
+// x1' = and x1, 0xff
+// ...
+// b1:
+// x2 = load i32
+// x2' = and x2, 0xff
+// ...
+// b2:
+// x = phi x1', x2'
+// y = and x, 0xff
+//
+
+bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
+ if (!Load->isSimple() ||
+ !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))
+ return false;
+
+ // Skip loads we've already transformed or have no reason to transform.
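+  // (A single non-phi user in the load's own block is either the "and" we
+  // already inserted or a user that isel can already combine with the load.)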
+ if (Load->hasOneUse()) {
+ User *LoadUser = *Load->user_begin();
+ if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() &&
+        !isa<PHINode>(LoadUser))
+ return false;
+ }
+
+ // Look at all uses of Load, looking through phis, to determine how many bits
+ // of the loaded value are needed.
+ SmallVector<Instruction *, 8> WorkList;
+ SmallPtrSet<Instruction *, 16> Visited;
+ SmallVector<Instruction *, 8> AndsToMaybeRemove;
+ for (auto *U : Load->users())
+ WorkList.push_back(cast<Instruction>(U));
+
+ EVT LoadResultVT = TLI->getValueType(*DL, Load->getType());
+ unsigned BitWidth = LoadResultVT.getSizeInBits();
+ APInt DemandBits(BitWidth, 0);
+ APInt WidestAndBits(BitWidth, 0);
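+  // DemandBits accumulates every bit some transitive user demands, while
+  // WidestAndBits tracks the widest constant "and" mask seen; the transform
+  // only pays off if some existing "and" uses exactly the DemandBits mask.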
+
+ while (!WorkList.empty()) {
+ Instruction *I = WorkList.back();
+ WorkList.pop_back();
+
+ // Break use-def graph loops.
+ if (!Visited.insert(I).second)
+ continue;
+
+ // For a PHI node, push all of its users.
+ if (auto *Phi = dyn_cast<PHINode>(I)) {
+ for (auto *U : Phi->users())
+ WorkList.push_back(cast<Instruction>(U));
+ continue;
+ }
+
+ switch (I->getOpcode()) {
+ case llvm::Instruction::And: {
+ auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!AndC)
+ return false;
+ APInt AndBits = AndC->getValue();
+ DemandBits |= AndBits;
+ // Keep track of the widest and mask we see.
+ if (AndBits.ugt(WidestAndBits))
+ WidestAndBits = AndBits;
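+      // Only an "and" applied directly to the load (not one reached through a
+      // phi) can be replaced by the new "and" we insert; remember such
+      // instructions as removal candidates. They are re-checked against the
+      // final DemandBits mask before being erased below.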
+ if (AndBits == WidestAndBits && I->getOperand(0) == Load)
+ AndsToMaybeRemove.push_back(I);
+ break;
+ }
+
+ case llvm::Instruction::Shl: {
+ auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!ShlC)
+ return false;
+ uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1);
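+      // Shifting left by ShiftAmt discards the high ShiftAmt bits of the
+      // source, so only the low (BitWidth - ShiftAmt) bits are demanded.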
+ auto ShlDemandBits = APInt::getAllOnesValue(BitWidth).lshr(ShiftAmt);
+ DemandBits |= ShlDemandBits;
+ break;
+ }
+
+ case llvm::Instruction::Trunc: {
+ EVT TruncVT = TLI->getValueType(*DL, I->getType());
+ unsigned TruncBitWidth = TruncVT.getSizeInBits();
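+      // A trunc to an N-bit type demands only the low N bits of its operand.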
+ auto TruncBits = APInt::getAllOnesValue(TruncBitWidth).zext(BitWidth);
+ DemandBits |= TruncBits;
+ break;
+ }
+
+ default:
+ return false;
+ }
+ }
+
+ uint32_t ActiveBits = DemandBits.getActiveBits();
+ // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the
+ // target even if isLoadExtLegal says an i1 EXTLOAD is valid. For example,
+ // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but
+  // (and (load x) 1) is not matched as a single instruction but rather as an
+  // LDR followed by an AND.
+ // TODO: Look into removing this restriction by fixing backends to either
+ // return false for isLoadExtLegal for i1 or have them select this pattern to
+ // a single instruction.
+ //
+ // Also avoid hoisting if we didn't see any ands with the exact DemandBits
+ // mask, since these are the only ands that will be removed by isel.
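+  // (For example, if the only users are a trunc to i8 and an "and" with 0x0f,
+  // DemandBits is 0xff but WidestAndBits is 0x0f; inserting an "and" with
+  // 0xff would add an instruction without letting isel remove any.)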
+ if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) ||
+ WidestAndBits != DemandBits)
+ return false;
+
+ LLVMContext &Ctx = Load->getType()->getContext();
+ Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits);
+ EVT TruncVT = TLI->getValueType(*DL, TruncTy);
+
+ // Reject cases that won't be matched as extloads.
+ if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() ||
+ !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT))
+ return false;
+
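+  // Insert the new "and" immediately after the load so it dominates every
+  // user the load dominates, including users reached through phis.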
+ IRBuilder<> Builder(Load->getNextNode());
+  auto *NewAnd = cast<Instruction>(
+      Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
+
+ // Replace all uses of load with new and (except for the use of load in the
+ // new and itself).
+ Load->replaceAllUsesWith(NewAnd);
+ NewAnd->setOperand(0, Load);
+
+ // Remove any and instructions that are now redundant.
+ for (auto *And : AndsToMaybeRemove)
+ // Check that the and mask is the same as the one we decided to put on the
+ // new and.
+ if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) {
+ And->replaceAllUsesWith(NewAnd);
+ if (&*CurInstIterator == And)
+ CurInstIterator = std::next(And->getIterator());
+ And->eraseFromParent();
+ ++NumAndUses;
+ }
+
+ ++NumAndsAdded;
+ return true;
+}
+
/// Check if V (an operand of a select instruction) is an expensive instruction
/// that is only used once.
static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
stripInvariantGroupMetadata(*LI);
if (TLI) {
+ bool Modified = optimizeLoadExt(LI);
unsigned AS = LI->getPointerAddressSpace();
- return optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+ Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+ return Modified;
}
return false;
}
store i64 %load64, i64* %dst2, align 8
ret void
}
+
+; Test for CodeGenPrepare::optimizeLoadExt(): simple case: two zext-able
+; loads feeding a phi whose value is then masked with 0xffff.
+define i32 @test_free_zext3(i32* %ptr, i32* %ptr2, i32* %dst, i32 %c) {
+; CHECK-LABEL: test_free_zext3:
+bb1:
+; CHECK: ldrh [[REG:w[0-9]+]]
+; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff
+ %tmp1 = load i32, i32* %ptr, align 4
+ %cmp = icmp ne i32 %c, 0
+ br i1 %cmp, label %bb2, label %bb3
+bb2:
+; CHECK: ldrh [[REG2:w[0-9]+]]
+; CHECK-NOT: and {{w[0-9]+}}, [[REG2]], #0xffff
+ %tmp2 = load i32, i32* %ptr2, align 4
+ br label %bb3
+bb3:
+ %tmp3 = phi i32 [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
+; CHECK-NOT: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff
+ %tmpand = and i32 %tmp3, 65535
+ ret i32 %tmpand
+}
+
+; Test for CodeGenPrepare::optimizeLoadExt(): check case of zext-able
+; load feeding a phi in the same block.
+define void @test_free_zext4(i32* %ptr, i32* %ptr2, i32* %dst) {
+; CHECK-LABEL: test_free_zext4:
+; CHECK: ldrh [[REG:w[0-9]+]]
+; TODO: fix isel to remove the final and (check disabled until then):
+; XCHECK-NOT: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff
+; CHECK: ldrh [[REG2:w[0-9]+]]
+bb1:
+ %load1 = load i32, i32* %ptr, align 4
+ br label %loop
+loop:
+ %phi = phi i32 [ %load1, %bb1 ], [ %load2, %loop ]
+ %and = and i32 %phi, 65535
+ store i32 %and, i32* %dst, align 4
+ %load2 = load i32, i32* %ptr2, align 4
+ %cmp = icmp ne i32 %and, 0
+ br i1 %cmp, label %loop, label %end
+end:
+ ret void
+}
--- /dev/null
+; RUN: opt -S -codegenprepare -mtriple=aarch64-linux %s | FileCheck %s
+
+; Test for CodeGenPrepare::optimizeLoadExt(): simple case: two zext-able
+; loads feeding a phi whose value is then masked with 0xffff.
+define i32 @test_free_zext(i32* %ptr, i32* %ptr2, i32 %c) {
+; CHECK-LABEL: @test_free_zext(
+bb1:
+; CHECK: bb1:
+; CHECK: %[[T1:.*]] = load
+; CHECK: %[[A1:.*]] = and i32 %[[T1]], 65535
+ %load1 = load i32, i32* %ptr, align 4
+ %cmp = icmp ne i32 %c, 0
+ br i1 %cmp, label %bb2, label %bb3
+bb2:
+; CHECK: bb2:
+; CHECK: %[[T2:.*]] = load
+; CHECK: %[[A2:.*]] = and i32 %[[T2]], 65535
+ %load2 = load i32, i32* %ptr2, align 4
+ br label %bb3
+bb3:
+; CHECK: bb3:
+; CHECK: phi i32 [ %[[A1]], %bb1 ], [ %[[A2]], %bb2 ]
+ %phi = phi i32 [ %load1, %bb1 ], [ %load2, %bb2 ]
+ %and = and i32 %phi, 65535
+ ret i32 %and
+}
+
+; Test for CodeGenPrepare::optimizeLoadExt(): exercise all opcode
+; cases of active bit calculation.
+define i32 @test_free_zext2(i32* %ptr, i16* %dst16, i32* %dst32, i32 %c) {
+; CHECK-LABEL: @test_free_zext2(
+bb1:
+; CHECK: bb1:
+; CHECK: %[[T1:.*]] = load
+; CHECK: %[[A1:.*]] = and i32 %[[T1]], 65535
+ %load1 = load i32, i32* %ptr, align 4
+ %cmp = icmp ne i32 %c, 0
+ br i1 %cmp, label %bb2, label %bb4
+bb2:
+; CHECK: bb2:
+ %trunc = trunc i32 %load1 to i16
+ store i16 %trunc, i16* %dst16, align 2
+ br i1 %cmp, label %bb3, label %bb4
+bb3:
+; CHECK: bb3:
+ %shl = shl i32 %load1, 16
+ store i32 %shl, i32* %dst32, align 4
+ br label %bb4
+bb4:
+; CHECK: bb4:
+; CHECK-NOT: and
+; CHECK: ret i32 %[[A1]]
+ %and = and i32 %load1, 65535
+ ret i32 %and
+}
+
+; Test for CodeGenPrepare::optimizeLoadExt(): check case of zext-able
+; load feeding a phi in the same block.
+define void @test_free_zext3(i32* %ptr, i32* %ptr2, i32* %dst, i64* %c) {
+; CHECK-LABEL: @test_free_zext3(
+bb1:
+; CHECK: bb1:
+; CHECK: %[[T1:.*]] = load
+; CHECK: %[[A1:.*]] = and i32 %[[T1]], 65535
+ %load1 = load i32, i32* %ptr, align 4
+ br label %loop
+loop:
+; CHECK: loop:
+; CHECK: phi i32 [ %[[A1]], %bb1 ], [ %[[A2:.*]], %loop ]
+ %phi = phi i32 [ %load1, %bb1 ], [ %load2, %loop ]
+ %and = and i32 %phi, 65535
+ store i32 %and, i32* %dst, align 4
+ %idx = load volatile i64, i64* %c, align 4
+ %addr = getelementptr inbounds i32, i32* %ptr2, i64 %idx
+; CHECK: %[[T2:.*]] = load i32
+; CHECK: %[[A2]] = and i32 %[[T2]], 65535
+ %load2 = load i32, i32* %addr, align 4
+ %cmp = icmp ne i64 %idx, 0
+ br i1 %cmp, label %loop, label %end
+end:
+ ret void
+}