From 9ed30bb2303dc4676af9892f780a14a019d030c6 Mon Sep 17 00:00:00 2001
From: Tim Northover
Date: Mon, 10 Feb 2014 14:04:07 +0000
Subject: [PATCH] ARM: use LLVM IR to represent the vshrn operation

vshrn is just the combination of a right shift and a truncate (and the
limits on the immediate value actually mean the signedness of the shift
doesn't matter). Using that representation allows us to get rid of an
ARM-specific intrinsic, share more code with AArch64 and hopefully get
better code out of the mid-end optimisers.
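For illustration, a minimal sketch in IR (the value names and the shift
amount of 5 are made up for this example): the narrowing shift that
previously had to be spelt with the ARM-specific intrinsic, using the
NEON convention of a negative splat for a right shift,

  %narrow = call <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16> %val, <8 x i16> <i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5>)

is now written as plain IR, which the backend matches to a single
vshrn.i16:

  %shifted = lshr <8 x i16> %val, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %narrow = trunc <8 x i16> %shifted to <8 x i8>

Because the shift amount is limited to [1, 8] in the 16-bit case, every
bit that survives the truncation comes from %val itself rather than
from the sign/zero fill, so lshr and ashr narrow to the same value.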
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@201085 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/IntrinsicsARM.td   |  1 -
 lib/Target/ARM/ARMISelLowering.cpp |  5 ----
 lib/Target/ARM/ARMISelLowering.h   |  1 -
 lib/Target/ARM/ARMInstrFormats.td  |  8 ++---
 lib/Target/ARM/ARMInstrNEON.td     | 17 ++++++++---
 test/CodeGen/ARM/reg_sequence.ll   |  8 +++--
 test/CodeGen/ARM/vshrn.ll          | 47 ++++++++++++++++++++++++------
 7 files changed, 60 insertions(+), 27 deletions(-)

diff --git a/include/llvm/IR/IntrinsicsARM.td b/include/llvm/IR/IntrinsicsARM.td
index dd80e2e179e..8002cc49b53 100644
--- a/include/llvm/IR/IntrinsicsARM.td
+++ b/include/llvm/IR/IntrinsicsARM.td
@@ -289,7 +289,6 @@ def int_arm_neon_vshifts : Neon_2Arg_Intrinsic;
 def int_arm_neon_vshiftu : Neon_2Arg_Intrinsic;
 def int_arm_neon_vshiftls : Neon_2Arg_Long_Intrinsic;
 def int_arm_neon_vshiftlu : Neon_2Arg_Long_Intrinsic;
-def int_arm_neon_vshiftn : Neon_2Arg_Narrow_Intrinsic;
 
 // Vector Rounding Shift.
 def int_arm_neon_vrshifts : Neon_2Arg_Intrinsic;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 031334e5e6c..73f1b8f7472 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -1081,7 +1081,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::VSHLLs: return "ARMISD::VSHLLs";
   case ARMISD::VSHLLu: return "ARMISD::VSHLLu";
   case ARMISD::VSHLLi: return "ARMISD::VSHLLi";
-  case ARMISD::VSHRN: return "ARMISD::VSHRN";
   case ARMISD::VRSHRs: return "ARMISD::VRSHRs";
   case ARMISD::VRSHRu: return "ARMISD::VRSHRu";
   case ARMISD::VRSHRN: return "ARMISD::VRSHRN";
@@ -9717,7 +9716,6 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
   case Intrinsic::arm_neon_vshiftu:
   case Intrinsic::arm_neon_vshiftls:
   case Intrinsic::arm_neon_vshiftlu:
-  case Intrinsic::arm_neon_vshiftn:
   case Intrinsic::arm_neon_vrshifts:
   case Intrinsic::arm_neon_vrshiftu:
   case Intrinsic::arm_neon_vrshiftn:
@@ -9771,7 +9769,6 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
         break;
       llvm_unreachable("invalid shift count for vqshlu intrinsic");
 
-    case Intrinsic::arm_neon_vshiftn:
     case Intrinsic::arm_neon_vrshiftn:
     case Intrinsic::arm_neon_vqshiftns:
     case Intrinsic::arm_neon_vqshiftnu:
@@ -9802,8 +9799,6 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
       VShiftOpc = (IntNo == Intrinsic::arm_neon_vshiftls ?
                    ARMISD::VSHLLs : ARMISD::VSHLLu);
       break;
-    case Intrinsic::arm_neon_vshiftn:
-      VShiftOpc = ARMISD::VSHRN; break;
     case Intrinsic::arm_neon_vrshifts:
       VShiftOpc = ARMISD::VRSHRs; break;
     case Intrinsic::arm_neon_vrshiftu:
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index eb67b815997..fd54cbb6471 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -116,7 +116,6 @@ namespace llvm {
       VSHLLs,       // ...left long (signed)
       VSHLLu,       // ...left long (unsigned)
       VSHLLi,       // ...left long (with maximum shift count)
-      VSHRN,        // ...right narrow
 
       // Vector rounding shift by immediate:
       VRSHRs,       // ...right (signed)
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 8e3f2c7e2bf..aafff982f3e 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -212,25 +212,25 @@ def msr_mask : Operand<i32> {
 //   32      imm6<5>   = '1', 32 - <imm> is encoded in imm6<4:0>
 //   64      64 - <imm> is encoded in imm6<5:0>
 def shr_imm8_asm_operand : ImmAsmOperand { let Name = "ShrImm8"; }
-def shr_imm8 : Operand<i32> {
+def shr_imm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 8; }]> {
   let EncoderMethod = "getShiftRight8Imm";
   let DecoderMethod = "DecodeShiftRight8Imm";
   let ParserMatchClass = shr_imm8_asm_operand;
 }
 def shr_imm16_asm_operand : ImmAsmOperand { let Name = "ShrImm16"; }
-def shr_imm16 : Operand<i32> {
+def shr_imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 16; }]> {
   let EncoderMethod = "getShiftRight16Imm";
   let DecoderMethod = "DecodeShiftRight16Imm";
   let ParserMatchClass = shr_imm16_asm_operand;
 }
 def shr_imm32_asm_operand : ImmAsmOperand { let Name = "ShrImm32"; }
-def shr_imm32 : Operand<i32> {
+def shr_imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]> {
   let EncoderMethod = "getShiftRight32Imm";
   let DecoderMethod = "DecodeShiftRight32Imm";
   let ParserMatchClass = shr_imm32_asm_operand;
 }
 def shr_imm64_asm_operand : ImmAsmOperand { let Name = "ShrImm64"; }
-def shr_imm64 : Operand<i32> {
+def shr_imm64 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 64; }]> {
   let EncoderMethod = "getShiftRight64Imm";
   let DecoderMethod = "DecodeShiftRight64Imm";
   let ParserMatchClass = shr_imm64_asm_operand;
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index b732bca84b8..a1e4159259e 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -3048,12 +3048,13 @@ class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
 // Narrow shift by immediate.
 class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
              InstrItinClass itin, string OpcodeStr, string Dt,
-             ValueType ResTy, ValueType OpTy, Operand ImmTy, SDNode OpNode>
+             ValueType ResTy, ValueType OpTy, Operand ImmTy,
+             SDPatternOperator OpNode>
   : N2VImm<op24, op23, op11_8, op7, op6, op4,
            (outs DPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, itin,
            OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
            [(set DPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm),
-                                         (i32 imm:$SIMM))))]>;
+                                         (i32 ImmTy:$SIMM))))]>;
 
 // Shift right by immediate and accumulate,
 // both double- and quad-register.
@@ -3960,7 +3961,7 @@ multiclass N2VLSh_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
 // element sizes of 16, 32, 64 bits:
 multiclass N2VNSh_HSD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
                       bit op4, InstrItinClass itin, string OpcodeStr, string Dt,
-                      SDNode OpNode> {
+                      SDPatternOperator OpNode> {
   def v8i8 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
                     OpcodeStr, !strconcat(Dt, "16"),
                     v8i8, v8i16, shr_imm8, OpNode> {
@@ -4967,7 +4968,15 @@ def VSHLLi32 : N2VLShMax<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll", "i32",
 
 // VSHRN     : Vector Shift Right and Narrow
 defm VSHRN    : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i",
-                           NEONvshrn>;
+                           PatFrag<(ops node:$Rn, node:$amt),
+                                   (trunc (NEONvshrs node:$Rn, node:$amt))>>;
+
+def : Pat<(v8i8 (trunc (NEONvshru (v8i16 QPR:$Vn), shr_imm8:$amt))),
+          (VSHRNv8i8 QPR:$Vn, shr_imm8:$amt)>;
+def : Pat<(v4i16 (trunc (NEONvshru (v4i32 QPR:$Vn), shr_imm16:$amt))),
+          (VSHRNv4i16 QPR:$Vn, shr_imm16:$amt)>;
+def : Pat<(v2i32 (trunc (NEONvshru (v2i64 QPR:$Vn), shr_imm32:$amt))),
+          (VSHRNv2i32 QPR:$Vn, shr_imm32:$amt)>;
 
 // VRSHL     : Vector Rounding Shift
 defm VRSHLs   : N3VInt_QHSDSh<0, 0, 0b0101, 0, N3RegVShFrm,
diff --git a/test/CodeGen/ARM/reg_sequence.ll b/test/CodeGen/ARM/reg_sequence.ll
index 25484f48485..b245674c3c9 100644
--- a/test/CodeGen/ARM/reg_sequence.ll
+++ b/test/CodeGen/ARM/reg_sequence.ll
@@ -34,9 +34,11 @@ entry:
   %12 = sext <4 x i16> %11 to <4 x i32>           ; <<4 x i32>> [#uses=1]
   %13 = mul <4 x i32> %1, %9                      ; <<4 x i32>> [#uses=1]
   %14 = mul <4 x i32> %3, %12                     ; <<4 x i32>> [#uses=1]
-  %15 = tail call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %13, <4 x i32> <i32 -12, i32 -12, i32 -12, i32 -12>) ; <<4 x i16>> [#uses=1]
-  %16 = tail call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %14, <4 x i32> <i32 -12, i32 -12, i32 -12, i32 -12>) ; <<4 x i16>> [#uses=1]
-  %17 = shufflevector <4 x i16> %15, <4 x i16> %16, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; <<8 x i16>> [#uses=1]
+  %15 = lshr <4 x i32> %13, <i32 12, i32 12, i32 12, i32 12>
+  %trunc_15 = trunc <4 x i32> %15 to <4 x i16>
+  %16 = lshr <4 x i32> %14, <i32 12, i32 12, i32 12, i32 12>
+  %trunc_16 = trunc <4 x i32> %16 to <4 x i16>
+  %17 = shufflevector <4 x i16> %trunc_15, <4 x i16> %trunc_16, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; <<8 x i16>> [#uses=1]
   %18 = bitcast i16* %o_ptr to i8*                ; <i8*> [#uses=1]
   tail call void @llvm.arm.neon.vst1.v8i16(i8* %18, <8 x i16> %17, i32 1)
   ret void
diff --git a/test/CodeGen/ARM/vshrn.ll b/test/CodeGen/ARM/vshrn.ll
index 40a94fee0d7..cc936be8292 100644
--- a/test/CodeGen/ARM/vshrn.ll
+++ b/test/CodeGen/ARM/vshrn.ll
@@ -4,29 +4,58 @@
 define <8 x i8> @vshrns8(<8 x i16>* %A) nounwind {
 ;CHECK-LABEL: vshrns8:
 ;CHECK: vshrn.i16
   %tmp1 = load <8 x i16>* %A
-  %tmp2 = call <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
-  ret <8 x i8> %tmp2
+  %tmp2 = lshr <8 x i16> %tmp1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %tmp3 = trunc <8 x i16> %tmp2 to <8 x i8>
+  ret <8 x i8> %tmp3
 }
 
 define <4 x i16> @vshrns16(<4 x i32>* %A) nounwind {
 ;CHECK-LABEL: vshrns16:
 ;CHECK: vshrn.i32
   %tmp1 = load <4 x i32>* %A
-  %tmp2 = call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
-  ret <4 x i16> %tmp2
+  %tmp2 = ashr <4 x i32> %tmp1, <i32 16, i32 16, i32 16, i32 16>
+  %tmp3 = trunc <4 x i32> %tmp2 to <4 x i16>
+  ret <4 x i16> %tmp3
 }
 
 define <2 x i32> @vshrns32(<2 x i64>* %A) nounwind {
 ;CHECK-LABEL: vshrns32:
 ;CHECK: vshrn.i64
   %tmp1 = load <2 x i64>* %A
-  %tmp2 = call <2 x i32> @llvm.arm.neon.vshiftn.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
-  ret <2 x i32> %tmp2
+  %tmp2 = ashr <2 x i64> %tmp1, <i64 32, i64 32>
+  %tmp3 = trunc <2 x i64> %tmp2 to <2 x i32>
+  ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @vshrns8_bad(<8 x i16>* %A) nounwind {
+; CHECK-LABEL: vshrns8_bad:
+; CHECK: vshr.s16
+; CHECK: vmovn.i16
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = ashr <8 x i16> %tmp1, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
+  %tmp3 = trunc <8 x i16> %tmp2 to <8 x i8>
+  ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vshrns16_bad(<4 x i32>* %A) nounwind {
+; CHECK-LABEL: vshrns16_bad:
+; CHECK: vshr.u32
+; CHECK: vmovn.i32
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = lshr <4 x i32> %tmp1, <i32 17, i32 17, i32 17, i32 17>
+  %tmp3 = trunc <4 x i32> %tmp2 to <4 x i16>
+  ret <4 x i16> %tmp3
 }
 
-declare <8 x i8>  @llvm.arm.neon.vshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vshiftn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+define <2 x i32> @vshrns32_bad(<2 x i64>* %A) nounwind {
+; CHECK-LABEL: vshrns32_bad:
+; CHECK: vshr.u64
+; CHECK: vmovn.i64
+  %tmp1 = load <2 x i64>* %A
+  %tmp2 = lshr <2 x i64> %tmp1, <i64 33, i64 33>
+  %tmp3 = trunc <2 x i64> %tmp2 to <2 x i32>
+  ret <2 x i32> %tmp3
+}
 
 define <8 x i8> @vrshrns8(<8 x i16>* %A) nounwind {
 ;CHECK-LABEL: vrshrns8:
-- 
2.34.1