From: Tim Northover Date: Thu, 24 Apr 2014 12:11:53 +0000 (+0000) Subject: AArch64/ARM64: implement BFI optimisation X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=b62ba5eca09b8bf196093279f6c8911d5f90c7cc;p=oota-llvm.git AArch64/ARM64: implement BFI optimisation ARM64 was not producing pure BFI instructions for bitfield insertion operations, unlike AArch64. The approach had to be a little different (in ISelDAGToDAG rather than ISelLowering), and the outcomes aren't identical but hopefully this gives it similar power. This should address PR19424. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@207102 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp b/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp index 6c4d175b883..6b68f3db525 100644 --- a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp +++ b/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp @@ -1394,28 +1394,23 @@ SDNode *ARM64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 3); } -// Is mask a i32 or i64 binary sequence 1..10..0 and -// CountTrailingZeros(mask) == ExpectedTrailingZeros -static bool isHighMask(uint64_t Mask, unsigned ExpectedTrailingZeros, - unsigned NumberOfIgnoredHighBits, EVT VT) { +/// Does DstMask form a complementary pair with the mask provided by +/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking, +/// this asks whether DstMask zeroes precisely those bits that will be set by +/// the other half. +static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted, + unsigned NumberOfIgnoredHighBits, EVT VT) { assert((VT == MVT::i32 || VT == MVT::i64) && "i32 or i64 mask type expected!"); + unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits; + APInt SignificantBits = + ~APInt::getHighBitsSet(BitWidth, NumberOfIgnoredHighBits); - uint64_t ExpectedMask; - if (VT == MVT::i32) { - uint32_t ExpectedMaski32 = ~0 << ExpectedTrailingZeros; - ExpectedMask = ExpectedMaski32; - if (NumberOfIgnoredHighBits) { - uint32_t highMask = ~0 << (32 - NumberOfIgnoredHighBits); - Mask |= highMask; - } - } else { - ExpectedMask = ((uint64_t) ~0) << ExpectedTrailingZeros; - if (NumberOfIgnoredHighBits) - Mask |= ((uint64_t) ~0) << (64 - NumberOfIgnoredHighBits); - } + APInt SignificantDstMask = APInt(BitWidth, DstMask); + APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth); - return Mask == ExpectedMask; + return (SignificantDstMask & SignificantBitsToBeInserted) == 0 && + (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue(); } // Look for bits that will be useful for later uses. @@ -1593,6 +1588,79 @@ static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) { UsefulBits &= UsersUsefulBits; } +/// Create a machine node performing a notional SHL of Op by ShlAmount. If +/// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is +/// 0, return Op unchanged. +static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) { + if (ShlAmount == 0) + return Op; + + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + unsigned UBFMOpc = BitWidth == 32 ? ARM64::UBFMWri : ARM64::UBFMXri; + + SDNode *ShiftNode; + if (ShlAmount > 0) { + // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt + ShiftNode = CurDAG->getMachineNode( + UBFMOpc, SDLoc(Op), VT, Op, + CurDAG->getTargetConstant(BitWidth - ShlAmount, VT), + CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, VT)); + } else { + // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1 + assert(ShlAmount < 0 && "expected right shift"); + int ShrAmount = -ShlAmount; + ShiftNode = CurDAG->getMachineNode( + UBFMOpc, SDLoc(Op), VT, Op, CurDAG->getTargetConstant(ShrAmount, VT), + CurDAG->getTargetConstant(BitWidth - 1, VT)); + } + + return SDValue(ShiftNode, 0); +} + +/// Does this tree qualify as an attempt to move a bitfield into position, +/// essentially "(and (shl VAL, N), Mask)". +static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, + SDValue &Src, int &ShiftAmount, + int &MaskWidth) { + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + assert(BitWidth == 32 || BitWidth == 64); + + APInt KnownZero, KnownOne; + CurDAG->ComputeMaskedBits(Op, KnownZero, KnownOne); + + // Non-zero in the sense that they're not provably zero, which is the key + // point if we want to use this value + uint64_t NonZeroBits = (~KnownZero).getZExtValue(); + + // Discard a constant AND mask if present. It's safe because the node will + // already have been factored into the ComputeMaskedBits calculation above. + uint64_t AndImm; + if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) { + assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0); + Op = Op.getOperand(0); + } + + uint64_t ShlImm; + if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm)) + return false; + Op = Op.getOperand(0); + + if (!isShiftedMask_64(NonZeroBits)) + return false; + + ShiftAmount = countTrailingZeros(NonZeroBits); + MaskWidth = CountTrailingOnes_64(NonZeroBits >> ShiftAmount); + + // BFI encompasses sufficiently many nodes that it's worth inserting an extra + // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL + // amount. + Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount); + + return true; +} + // Given a OR operation, check if we have the following pattern // ubfm c, b, imm, imm2 (or something that does the same jobs, see // isBitfieldExtractOp) @@ -1602,9 +1670,9 @@ static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) { // if yes, given reference arguments will be update so that one can replace // the OR instruction with: // f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2 -static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Opd0, - SDValue &Opd1, unsigned &LSB, - unsigned &MSB, SelectionDAG *CurDAG) { +static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, + SDValue &Src, unsigned &ImmR, + unsigned &ImmS, SelectionDAG *CurDAG) { assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); // Set Opc @@ -1632,30 +1700,36 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Opd0, for (int i = 0; i < 2; ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) { unsigned BFXOpc; - // Set Opd1, LSB and MSB arguments by looking for - // c = ubfm b, imm, imm2 - if (!isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Opd1, LSB, MSB, - NumberOfIgnoredLowBits, true)) - continue; - - // Check that the returned opcode is compatible with the pattern, - // i.e., same type and zero extended (U and not S) - if ((BFXOpc != ARM64::UBFMXri && VT == MVT::i64) || - (BFXOpc != ARM64::UBFMWri && VT == MVT::i32)) - continue; - - // Compute the width of the bitfield insertion - int sMSB = MSB - LSB + 1; - // FIXME: This constraints is to catch bitfield insertion we may - // want to widen the pattern if we want to grab general bitfied - // move case - if (sMSB <= 0) + int DstLSB, Width; + if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS, + NumberOfIgnoredLowBits, true)) { + // Check that the returned opcode is compatible with the pattern, + // i.e., same type and zero extended (U and not S) + if ((BFXOpc != ARM64::UBFMXri && VT == MVT::i64) || + (BFXOpc != ARM64::UBFMWri && VT == MVT::i32)) + continue; + + // Compute the width of the bitfield insertion + DstLSB = 0; + Width = ImmS - ImmR + 1; + // FIXME: This constraint is to catch bitfield insertion we may + // want to widen the pattern if we want to grab general bitfied + // move case + if (Width <= 0) + continue; + + // If the mask on the insertee is correct, we have a BFXIL operation. We + // can share the ImmR and ImmS values from the already-computed UBFM. + } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src, + DstLSB, Width)) { + ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); + ImmS = Width - 1; + } else continue; // Check the second part of the pattern EVT VT = OrOpd1->getValueType(0); - if (VT != MVT::i32 && VT != MVT::i64) - continue; + assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand"); // Compute the Known Zero for the candidate of the first operand. // This allows to catch more general case than just looking for @@ -1666,19 +1740,22 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Opd0, // Check if there is enough room for the second operand to appear // in the first one - if (KnownZero.countTrailingOnes() < (unsigned)sMSB) + APInt BitsToBeInserted = + APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width); + + if ((BitsToBeInserted & ~KnownZero) != 0) continue; // Set the first operand uint64_t Imm; if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) && - isHighMask(Imm, sMSB, NumberOfIgnoredHighBits, VT)) + isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT)) // In that case, we can eliminate the AND - Opd0 = OrOpd1->getOperand(0); + Dst = OrOpd1->getOperand(0); else // Maybe the AND has been removed by simplify-demanded-bits // or is useful because it discards more bits - Opd0 = OrOpd1Val; + Dst = OrOpd1Val; // both parts match return true; diff --git a/test/CodeGen/AArch64/bitfield-insert.ll b/test/CodeGen/AArch64/bitfield-insert.ll index 1f046087abc..47aa5b09572 100644 --- a/test/CodeGen/AArch64/bitfield-insert.ll +++ b/test/CodeGen/AArch64/bitfield-insert.ll @@ -1,4 +1,5 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64 +; RUN: llc -mtriple=arm64-none-linux-gnu < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64 ; First, a simple example from Clang. The registers could plausibly be ; different, but probably won't be. @@ -7,8 +8,10 @@ define [1 x i64] @from_clang([1 x i64] %f.coerce, i32 %n) nounwind readnone { ; CHECK-LABEL: from_clang: -; CHECK: bfi w0, w1, #3, #4 -; CHECK-NEXT: ret +; CHECK-AARCH64: bfi w0, w1, #3, #4 +; CHECK-ARCH64-NEXT: ret + +; CHECK-ARM64: bfm {{w[0-9]+}}, {{w[0-9]+}}, #29, #3 entry: %f.coerce.fca.0.extract = extractvalue [1 x i64] %f.coerce, 0 @@ -26,7 +29,9 @@ entry: define void @test_whole32(i32* %existing, i32* %new) { ; CHECK-LABEL: test_whole32: -; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #26, #5 + +; CHECK-AARCH64: bfi {{w[0-9]+}}, {{w[0-9]+}}, #26, #5 +; CHECK-ARM64: bfm {{w[0-9]+}}, {{w[0-9]+}}, #6, #4 %oldval = load volatile i32* %existing %oldval_keep = and i32 %oldval, 2214592511 ; =0x83ffffff @@ -43,7 +48,8 @@ define void @test_whole32(i32* %existing, i32* %new) { define void @test_whole64(i64* %existing, i64* %new) { ; CHECK-LABEL: test_whole64: -; CHECK: bfi {{x[0-9]+}}, {{x[0-9]+}}, #26, #14 +; CHECK-AARCH64: bfi {{x[0-9]+}}, {{x[0-9]+}}, #26, #14 +; CHECK-ARM64: bfm {{x[0-9]+}}, {{x[0-9]+}}, #38, #13 ; CHECK-NOT: and ; CHECK: ret @@ -62,8 +68,11 @@ define void @test_whole64(i64* %existing, i64* %new) { define void @test_whole32_from64(i64* %existing, i64* %new) { ; CHECK-LABEL: test_whole32_from64: -; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #{{0|16}}, #16 -; CHECK-NOT: and + +; CHECK-AARCH64: bfi {{w[0-9]+}}, {{w[0-9]+}}, #{{0|16}}, #16 +; CHECK-AARCH64-NOT: and +; CHECK-ARM64: bfm {{x[0-9]+}}, {{x[0-9]+}}, #0, #15 + ; CHECK: ret %oldval = load volatile i64* %existing @@ -80,8 +89,12 @@ define void @test_whole32_from64(i64* %existing, i64* %new) { define void @test_32bit_masked(i32 *%existing, i32 *%new) { ; CHECK-LABEL: test_32bit_masked: -; CHECK: bfi [[INSERT:w[0-9]+]], {{w[0-9]+}}, #3, #4 -; CHECK: and {{w[0-9]+}}, [[INSERT]], #0xff + +; CHECK-AARCH64: bfi [[INSERT:w[0-9]+]], {{w[0-9]+}}, #3, #4 +; CHECK-AARCH64: and {{w[0-9]+}}, [[INSERT]], #0xff + +; CHECK-ARM64: and +; CHECK-ARM64: bfm {{w[0-9]+}}, {{w[0-9]+}}, #29, #3 %oldval = load volatile i32* %existing %oldval_keep = and i32 %oldval, 135 ; = 0x87 @@ -98,8 +111,11 @@ define void @test_32bit_masked(i32 *%existing, i32 *%new) { define void @test_64bit_masked(i64 *%existing, i64 *%new) { ; CHECK-LABEL: test_64bit_masked: -; CHECK: bfi [[INSERT:x[0-9]+]], {{x[0-9]+}}, #40, #8 -; CHECK: and {{x[0-9]+}}, [[INSERT]], #0xffff00000000 +; CHECK-AARCH64: bfi [[INSERT:x[0-9]+]], {{x[0-9]+}}, #40, #8 +; CHECK-AARCH64: and {{x[0-9]+}}, [[INSERT]], #0xffff00000000 + +; CHECK-ARM64: and +; CHECK-ARM64: bfm {{x[0-9]+}}, {{x[0-9]+}}, #24, #7 %oldval = load volatile i64* %existing %oldval_keep = and i64 %oldval, 1095216660480 ; = 0xff_0000_0000 @@ -117,8 +133,12 @@ define void @test_64bit_masked(i64 *%existing, i64 *%new) { ; Mask is too complicated for literal ANDwwi, make sure other avenues are tried. define void @test_32bit_complexmask(i32 *%existing, i32 *%new) { ; CHECK-LABEL: test_32bit_complexmask: -; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #3, #4 -; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + +; CHECK-AARCH64: bfi {{w[0-9]+}}, {{w[0-9]+}}, #3, #4 +; CHECK-AARCH64: and {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + +; CHECK-ARM64: and +; CHECK-ARM64: bfm {{w[0-9]+}}, {{w[0-9]+}}, #29, #3 %oldval = load volatile i32* %existing %oldval_keep = and i32 %oldval, 647 ; = 0x287 @@ -137,6 +157,7 @@ define void @test_32bit_complexmask(i32 *%existing, i32 *%new) { define void @test_32bit_badmask(i32 *%existing, i32 *%new) { ; CHECK-LABEL: test_32bit_badmask: ; CHECK-NOT: bfi +; CHECK-NOT: bfm ; CHECK: ret %oldval = load volatile i32* %existing @@ -156,6 +177,7 @@ define void @test_32bit_badmask(i32 *%existing, i32 *%new) { define void @test_64bit_badmask(i64 *%existing, i64 *%new) { ; CHECK-LABEL: test_64bit_badmask: ; CHECK-NOT: bfi +; CHECK-NOT: bfm ; CHECK: ret %oldval = load volatile i64* %existing @@ -186,8 +208,8 @@ define void @test_32bit_with_shr(i32* %existing, i32* %new) { %combined = or i32 %oldval_keep, %newval_masked store volatile i32 %combined, i32* %existing ; CHECK: lsr [[BIT:w[0-9]+]], {{w[0-9]+}}, #14 -; CHECK: bfi {{w[0-9]}}, [[BIT]], #26, #5 +; CHECK-AARCH64: bfi {{w[0-9]+}}, [[BIT]], #26, #5 +; CHECK-ARM64: bfm {{w[0-9]+}}, [[BIT]], #6, #4 ret void } - diff --git a/test/CodeGen/ARM64/strict-align.ll b/test/CodeGen/ARM64/strict-align.ll index e3921723866..bb42780a858 100644 --- a/test/CodeGen/ARM64/strict-align.ll +++ b/test/CodeGen/ARM64/strict-align.ll @@ -4,7 +4,7 @@ define i32 @f0(i32* nocapture %p) nounwind { ; CHECK-STRICT: ldrh [[HIGH:w[0-9]+]], [x0, #2] ; CHECK-STRICT: ldrh [[LOW:w[0-9]+]], [x0] -; CHECK-STRICT: orr w0, [[LOW]], [[HIGH]], lsl #16 +; CHECK-STRICT: bfm [[LOW]], [[HIGH]], #16, #15 ; CHECK-STRICT: ret ; CHECK: ldr w0, [x0] @@ -15,7 +15,7 @@ define i32 @f0(i32* nocapture %p) nounwind { define i64 @f1(i64* nocapture %p) nounwind { ; CHECK-STRICT: ldp w[[LOW:[0-9]+]], w[[HIGH:[0-9]+]], [x0] -; CHECK-STRICT: orr x0, x[[LOW]], x[[HIGH]], lsl #32 +; CHECK-STRICT: bfm x[[LOW]], x[[HIGH]], #32, #31 ; CHECK-STRICT: ret ; CHECK: ldr x0, [x0]