From: Jiangning Liu Date: Wed, 15 Jan 2014 05:08:01 +0000 (+0000) Subject: For AArch64, lowering sext_inreg and generate optimized code by using SXTL. X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=b6db372c96906eac67b26f50202553dab1653dbd;p=oota-llvm.git For AArch64, lowering sext_inreg and generate optimized code by using SXTL. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@199296 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 30378457620..581c8935c1f 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -286,6 +286,15 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM) setExceptionSelectorRegister(AArch64::X1); if (Subtarget->hasNEON()) { + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v1i64, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v16i8, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Expand); + setOperationAction(ISD::BUILD_VECTOR, MVT::v1i8, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom); @@ -3574,7 +3583,25 @@ static bool isVShiftRImm(SDValue Op, EVT VT, int64_t &Cnt) { return (Cnt >= 1 && Cnt <= ElementBits); } -/// Checks for immediate versions of vector shifts and lowers them. 
+static SDValue GenForSextInreg(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + EVT SrcVT, EVT DestVT, EVT SubRegVT, + const int *Mask, SDValue Src) { + SelectionDAG &DAG = DCI.DAG; + SDValue Bitcast + = DAG.getNode(ISD::BITCAST, SDLoc(N), SrcVT, Src); + SDValue Sext + = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), DestVT, Bitcast); + SDValue ShuffleVec + = DAG.getVectorShuffle(DestVT, SDLoc(N), Sext, DAG.getUNDEF(DestVT), Mask); + SDValue ExtractSubreg + = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N), + SubRegVT, ShuffleVec, + DAG.getTargetConstant(AArch64::sub_64, MVT::i32)), 0); + return ExtractSubreg; +} + +/// Checks for vector shifts and lowers them. static SDValue PerformShiftCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *ST) { @@ -3583,6 +3610,51 @@ static SDValue PerformShiftCombine(SDNode *N, if (N->getOpcode() == ISD::SRA && (VT == MVT::i32 || VT == MVT::i64)) return PerformSRACombine(N, DCI); + // We're looking for an SRA/SHL pair to help generate the instruction + // sshll v0.8h, v0.8b, #0 + // The instruction SXTL is also the alias of this instruction. + // + // For example, for DAG like below, + // v2i32 = sra (v2i32 (shl v2i32, 16)), 16 + // we can transform it into + // v2i32 = EXTRACT_SUBREG + // (v4i32 (shuffle_vector + // (v4i32 (sext (v4i16 (bitcast v2i32))), + // undef, (0, 2, u, u)), + // sub_64 + // + // With this transformation we expect to generate "SSHLL + UZIP1" + // Sometimes UZIP1 can be optimized away by combining with other context. 
+ int64_t ShrCnt, ShlCnt; + if (N->getOpcode() == ISD::SRA + && (VT == MVT::v2i32 || VT == MVT::v4i16) + && isVShiftRImm(N->getOperand(1), VT, ShrCnt) + && N->getOperand(0).getOpcode() == ISD::SHL + && isVShiftRImm(N->getOperand(0).getOperand(1), VT, ShlCnt)) { + SDValue Src = N->getOperand(0).getOperand(0); + if (VT == MVT::v2i32 && ShrCnt == 16 && ShlCnt == 16) { + // sext_inreg(v2i32, v2i16) + // We essentially only care about the Mask {0, 2, u, u} + int Mask[4] = {0, 2, 4, 6}; + return GenForSextInreg(N, DCI, MVT::v4i16, MVT::v4i32, MVT::v2i32, + Mask, Src); + } + else if (VT == MVT::v2i32 && ShrCnt == 24 && ShlCnt == 24) { + // sext_inreg(v2i32, v2i8) + // We essentially only care about the Mask {0, u, 4, u, u, u, u, u} + int Mask[8] = {0, 2, 4, 6, 8, 10, 12, 14}; + return GenForSextInreg(N, DCI, MVT::v8i8, MVT::v8i16, MVT::v2i32, + Mask, Src); + } + else if (VT == MVT::v4i16 && ShrCnt == 8 && ShlCnt == 8) { + // sext_inreg(v4i16, v4i8) + // We essentially only care about the Mask {0, 2, 4, 6, u, u, u, u} + int Mask[8] = {0, 2, 4, 6, 8, 10, 12, 14}; + return GenForSextInreg(N, DCI, MVT::v8i8, MVT::v8i16, MVT::v4i16, + Mask, Src); + } + } + // Nothing to be done for scalar shifts. 
const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!VT.isVector() || !TLI.isTypeLegal(VT)) diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td index f1cb122eefe..81371be0666 100644 --- a/lib/Target/AArch64/AArch64InstrNEON.td +++ b/lib/Target/AArch64/AArch64InstrNEON.td @@ -1877,6 +1877,10 @@ def UXTL2vv_16B : NeonI_ext_len_alias<"uxtl2", ".8h", ".16b", USHLLvvi_16B, VPR def UXTL2vv_8H : NeonI_ext_len_alias<"uxtl2", ".4s", ".8h", USHLLvvi_8H, VPR128, VPR128>; def UXTL2vv_4S : NeonI_ext_len_alias<"uxtl2", ".2d", ".4s", USHLLvvi_4S, VPR128, VPR128>; +def : Pat<(v8i16 (anyext (v8i8 VPR64:$Rn))), (USHLLvvi_8B VPR64:$Rn, 0)>; +def : Pat<(v4i32 (anyext (v4i16 VPR64:$Rn))), (USHLLvvi_4H VPR64:$Rn, 0)>; +def : Pat<(v2i64 (anyext (v2i32 VPR64:$Rn))), (USHLLvvi_2S VPR64:$Rn, 0)>; + // Rounding/Saturating shift class N2VShift_RQ opcode, string asmop, string T, RegisterOperand VPRC, ValueType Ty, Operand ImmTy, diff --git a/test/CodeGen/AArch64/sext_inreg.ll b/test/CodeGen/AArch64/sext_inreg.ll new file mode 100644 index 00000000000..2f76081eb09 --- /dev/null +++ b/test/CodeGen/AArch64/sext_inreg.ll @@ -0,0 +1,198 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +; For formal arguments, we have the following vector type promotion, +; v2i8 is promoted to v2i32(f64) +; v2i16 is promoted to v2i32(f64) +; v4i8 is promoted to v4i16(f64) +; v8i1 is promoted to v8i16(f128) + +define <2 x i8> @test_sext_inreg_v2i8i16(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone { +; CHECK-LABEL: test_sext_inreg_v2i8i16 +; CHECK: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h + %1 = sext <2 x i8> %v1 to <2 x i16> + %2 = sext <2 x i8> %v2 to <2 x i16> + %3 = shufflevector <2 x i16> %1, <2 x i16> %2, <2 x i32> + %4 = trunc <2 x i16> %3 to <2 x i8> + ret <2 x i8> %4 +} + +define <2 x i8> 
@test_sext_inreg_v2i8i16_2(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone { +; CHECK-LABEL: test_sext_inreg_v2i8i16_2 +; CHECK: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h + %a1 = shl <2 x i32> %v1, + %a2 = ashr <2 x i32> %a1, + %b1 = shl <2 x i32> %v2, + %b2 = ashr <2 x i32> %b1, + %c = shufflevector <2 x i32> %a2, <2 x i32> %b2, <2 x i32> + %d = trunc <2 x i32> %c to <2 x i8> + ret <2 x i8> %d +} + +define <2 x i8> @test_sext_inreg_v2i8i32(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone { +; CHECK-LABEL: test_sext_inreg_v2i8i32 +; CHECK: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h + %1 = sext <2 x i8> %v1 to <2 x i32> + %2 = sext <2 x i8> %v2 to <2 x i32> + %3 = shufflevector <2 x i32> %1, <2 x i32> %2, <2 x i32> + %4 = trunc <2 x i32> %3 to <2 x i8> + ret <2 x i8> %4 +} + +define <2 x i8> @test_sext_inreg_v2i8i64(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone { +; CHECK-LABEL: test_sext_inreg_v2i8i64 +; CHECK: ushll v1.2d, v1.2s, #0 +; CHECK: ushll v0.2d, v0.2s, #0 +; CHECK: shl v0.2d, v0.2d, #56 +; CHECK: sshr v0.2d, v0.2d, #56 +; CHECK: shl v1.2d, v1.2d, #56 +; CHECK: sshr v1.2d, v1.2d, #56 + %1 = sext <2 x i8> %v1 to <2 x i64> + %2 = sext <2 x i8> %v2 to <2 x i64> + %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> + %4 = trunc <2 x i64> %3 to <2 x i8> + ret <2 x i8> %4 +} + +define <4 x i8> @test_sext_inreg_v4i8i16(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone { +; CHECK-LABEL: test_sext_inreg_v4i8i16 +; CHECK: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h + %1 = sext <4 x i8> %v1 to <4 x i16> + %2 = sext <4 x i8> %v2 to <4 x i16> + %3 = shufflevector <4 x i16> %1, <4 x i16> %2, <4 x i32> + %4 = trunc <4 x i16> %3 to <4 x i8> + ret <4 x i8> %4 +} + +define <4 x i8> 
@test_sext_inreg_v4i8i16_2(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone { +; CHECK-LABEL: test_sext_inreg_v4i8i16_2 +; CHECK: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h + %a1 = shl <4 x i16> %v1, + %a2 = ashr <4 x i16> %a1, + %b1 = shl <4 x i16> %v2, + %b2 = ashr <4 x i16> %b1, + %c = shufflevector <4 x i16> %a2, <4 x i16> %b2, <4 x i32> + %d = trunc <4 x i16> %c to <4 x i8> + ret <4 x i8> %d +} + +define <4 x i8> @test_sext_inreg_v4i8i32(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone { +; CHECK-LABEL: test_sext_inreg_v4i8i32 +; CHECK: ushll v1.4s, v1.4h, #0 +; CHECK: ushll v0.4s, v0.4h, #0 +; CHECK: shl v0.4s, v0.4s, #24 +; CHECK: sshr v0.4s, v0.4s, #24 +; CHECK: shl v1.4s, v1.4s, #24 +; CHECK: sshr v1.4s, v1.4s, #24 + %1 = sext <4 x i8> %v1 to <4 x i32> + %2 = sext <4 x i8> %v2 to <4 x i32> + %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> + %4 = trunc <4 x i32> %3 to <4 x i8> + ret <4 x i8> %4 +} + +define <8 x i8> @test_sext_inreg_v8i8i16(<8 x i8> %v1, <8 x i8> %v2) nounwind readnone { +; CHECK-LABEL: test_sext_inreg_v8i8i16 +; CHECK: sshll v0.8h, v0.8b, #0 +; CHECK: sshll v1.8h, v1.8b, #0 + %1 = sext <8 x i8> %v1 to <8 x i16> + %2 = sext <8 x i8> %v2 to <8 x i16> + %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> + %4 = trunc <8 x i16> %3 to <8 x i8> + ret <8 x i8> %4 +} + +define <8 x i1> @test_sext_inreg_v8i1i16(<8 x i1> %v1, <8 x i1> %v2) nounwind readnone { +; CHECK-LABEL: test_sext_inreg_v8i1i16 +; CHECK: ushll v1.8h, v1.8b, #0 +; CHECK: ushll v0.8h, v0.8b, #0 +; CHECK: shl v0.8h, v0.8h, #15 +; CHECK: sshr v0.8h, v0.8h, #15 +; CHECK: shl v1.8h, v1.8h, #15 +; CHECK: sshr v1.8h, v1.8h, #15 + %1 = sext <8 x i1> %v1 to <8 x i16> + %2 = sext <8 x i1> %v2 to <8 x i16> + %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> + %4 = trunc <8 x i16> %3 to <8 x i1> + ret <8 x i1> %4 +} + +define <2 x i16> @test_sext_inreg_v2i16i32(<2 x i16> %v1, <2 
x i16> %v2) nounwind readnone { +; CHECK-LABEL: test_sext_inreg_v2i16i32 +; CHECK: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: uzp1 v1.4s, v1.4s, v1.4s + %1 = sext <2 x i16> %v1 to <2 x i32> + %2 = sext <2 x i16> %v2 to <2 x i32> + %3 = shufflevector <2 x i32> %1, <2 x i32> %2, <2 x i32> + %4 = trunc <2 x i32> %3 to <2 x i16> + ret <2 x i16> %4 +} + +define <2 x i16> @test_sext_inreg_v2i16i32_2(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone { +; CHECK-LABEL: test_sext_inreg_v2i16i32_2 +; CHECK: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: uzp1 v1.4s, v1.4s, v1.4s + %a1 = shl <2 x i32> %v1, + %a2 = ashr <2 x i32> %a1, + %b1 = shl <2 x i32> %v2, + %b2 = ashr <2 x i32> %b1, + %c = shufflevector <2 x i32> %a2, <2 x i32> %b2, <2 x i32> + %d = trunc <2 x i32> %c to <2 x i16> + ret <2 x i16> %d +} + +define <2 x i16> @test_sext_inreg_v2i16i64(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone { +; CHECK-LABEL: test_sext_inreg_v2i16i64 +; CHECK: ushll v1.2d, v1.2s, #0 +; CHECK: ushll v0.2d, v0.2s, #0 +; CHECK: shl v0.2d, v0.2d, #48 +; CHECK: sshr v0.2d, v0.2d, #48 +; CHECK: shl v1.2d, v1.2d, #48 +; CHECK: sshr v1.2d, v1.2d, #48 + %1 = sext <2 x i16> %v1 to <2 x i64> + %2 = sext <2 x i16> %v2 to <2 x i64> + %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> + %4 = trunc <2 x i64> %3 to <2 x i16> + ret <2 x i16> %4 +} + +define <4 x i16> @test_sext_inreg_v4i16i32(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone { +; CHECK-LABEL: test_sext_inreg_v4i16i32 +; CHECK: sshll v0.4s, v0.4h, #0 +; CHECK: sshll v1.4s, v1.4h, #0 + %1 = sext <4 x i16> %v1 to <4 x i32> + %2 = sext <4 x i16> %v2 to <4 x i32> + %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> + %4 = trunc <4 x i32> %3 to <4 x i16> + ret <4 x i16> %4 +} + +define <2 x i32> @test_sext_inreg_v2i32i64(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone { +; CHECK-LABEL: 
test_sext_inreg_v2i32i64 +; CHECK: sshll v0.2d, v0.2s, #0 +; CHECK: sshll v1.2d, v1.2s, #0 + %1 = sext <2 x i32> %v1 to <2 x i64> + %2 = sext <2 x i32> %v2 to <2 x i64> + %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> + %4 = trunc <2 x i64> %3 to <2 x i32> + ret <2 x i32> %4 +} +