From 2a0e97431ecef2aa6a24a16ced207d5b53fcfc2d Mon Sep 17 00:00:00 2001
From: Bob Wilson
Date: Sat, 27 Nov 2010 06:35:16 +0000
Subject: [PATCH] Add NEON VLD1-dup instructions (load 1 element to all lanes).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@120194 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMExpandPseudoInsts.cpp     | 13 ++++
 lib/Target/ARM/ARMInstrNEON.td              | 76 ++++++++++++++++++++-
 lib/Target/ARM/ARMSchedule.td               |  2 +
 lib/Target/ARM/ARMScheduleA8.td             | 12 ++++
 lib/Target/ARM/ARMScheduleA9.td             | 18 +++++
 test/CodeGen/ARM/2009-11-02-NegativeLane.ll |  3 +-
 test/CodeGen/ARM/vlddup.ll                  | 41 +++++++++++
 7 files changed, 162 insertions(+), 3 deletions(-)
 create mode 100644 test/CodeGen/ARM/vlddup.ll

diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index f593f02d3ab..34f602bfda6 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -112,6 +112,13 @@ namespace {
 }
 
 static const NEONLdStTableEntry NEONLdStTable[] = {
+{ ARM::VLD1DUPq16Pseudo,     ARM::VLD1DUPq16,     true, false, SingleSpc, 2, 4},
+{ ARM::VLD1DUPq16Pseudo_UPD, ARM::VLD1DUPq16_UPD, true, true,  SingleSpc, 2, 4},
+{ ARM::VLD1DUPq32Pseudo,     ARM::VLD1DUPq32,     true, false, SingleSpc, 2, 2},
+{ ARM::VLD1DUPq32Pseudo_UPD, ARM::VLD1DUPq32_UPD, true, true,  SingleSpc, 2, 2},
+{ ARM::VLD1DUPq8Pseudo,      ARM::VLD1DUPq8,      true, false, SingleSpc, 2, 8},
+{ ARM::VLD1DUPq8Pseudo_UPD,  ARM::VLD1DUPq8_UPD,  true, true,  SingleSpc, 2, 8},
+
 { ARM::VLD1LNq16Pseudo,     ARM::VLD1LNd16,     true, false, EvenDblSpc, 1, 4 },
 { ARM::VLD1LNq16Pseudo_UPD, ARM::VLD1LNd16_UPD, true, true,  EvenDblSpc, 1, 4 },
 { ARM::VLD1LNq32Pseudo,     ARM::VLD1LNd32,     true, false, EvenDblSpc, 1, 2 },
@@ -920,6 +927,12 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
     case ARM::VLD4q8oddPseudo_UPD:
     case ARM::VLD4q16oddPseudo_UPD:
     case ARM::VLD4q32oddPseudo_UPD:
+    case ARM::VLD1DUPq8Pseudo:
+    case ARM::VLD1DUPq16Pseudo:
+    case ARM::VLD1DUPq32Pseudo:
+    case ARM::VLD1DUPq8Pseudo_UPD:
+    case ARM::VLD1DUPq16Pseudo_UPD:
+    case ARM::VLD1DUPq32Pseudo_UPD:
       ExpandVLD(MBBI);
       break;
 
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index bf02b78c929..22716e3477c 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -162,8 +162,6 @@ def VSTMQDB
             IIC_fpStore_m, "",
             [(store (v2f64 QPR:$src), GPR:$Rn)]>;
 
-let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
-
 // Classes for VLD* pseudo-instructions with multi-register operands.
 // These are expanded to real instructions after register allocation.
 class VLDQPseudo<InstrItinClass itin>
@@ -183,6 +181,8 @@ class VLDQQQQWBPseudo<InstrItinClass itin>
                 (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin,
                 "$addr.addr = $wb, $src = $dst">;
 
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+
 // VLD1 : Vector Load (multiple single elements)
 class VLD1D<bits<4> op7_4, string Dt>
   : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$Vd),
@@ -790,7 +790,79 @@ def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32"> {
 def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
 def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
 
+} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
+
 // VLD1DUP : Vector Load (single element to all lanes)
+class VLD1DUP<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+              PatFrag LoadOp>
+  : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$Vd), (ins addrmode6:$Rn),
+          IIC_VLD1dup, "vld1", Dt, "\\{$Vd[]\\}, $Rn", "",
+          [(set DPR:$Vd, (Ty (NEONvdup (i32 (LoadOp addrmode6:$Rn)))))]> {
+  let Rm = 0b1111;
+}
+class VLD1QDUPPseudo<ValueType Ty, PatFrag LoadOp> : VLDQPseudo<IIC_VLD1dup> {
+  let Pattern = [(set QPR:$dst,
+                      (Ty (NEONvdup (i32 (LoadOp addrmode6:$addr)))))];
+}
+
+def VLD1DUPd8 : VLD1DUP<0b1100, {0,0,0,?}, "8", v8i8, extloadi8> {
+  let Inst{4} = Rn{4};
+}
+def VLD1DUPd16 : VLD1DUP<0b1100, {0,1,0,?}, "16", v4i16, extloadi16> {
+  let Inst{4} = Rn{4};
+}
+def VLD1DUPd32 : VLD1DUP<0b1100, {1,0,0,?}, "32", v2i32, load> {
+  let Inst{4} = Rn{4};
+}
+
+def VLD1DUPq8Pseudo : VLD1QDUPPseudo<v16i8, extloadi8>;
+def VLD1DUPq16Pseudo : VLD1QDUPPseudo<v8i16, extloadi16>;
+def VLD1DUPq32Pseudo : VLD1QDUPPseudo<v4i32, load>;
+
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+
+class VLD1QDUP<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+               PatFrag LoadOp>
+  : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2),
+          (ins addrmode6:$Rn), IIC_VLD1dup,
+          "vld1", Dt, "\\{$Vd[], $dst2[]\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+}
+
+def VLD1DUPq8 : VLD1QDUP<0b1100, {0,0,1,0}, "8", v16i8, extloadi8>;
+def VLD1DUPq16 : VLD1QDUP<0b1100, {0,1,1,?}, "16", v8i16, extloadi16> {
+  let Inst{4} = Rn{4};
+}
+def VLD1DUPq32 : VLD1QDUP<0b1100, {1,0,1,?}, "32", v4i32, load> {
+  let Inst{4} = Rn{4};
+}
+
+// ...with address register writeback:
+class VLD1DUPWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD1dupu,
+          "vld1", Dt, "\\{$Vd[]\\}, $Rn$Rm", "$Rn.addr = $wb", []>;
+class VLD1QDUPWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD1dupu,
+          "vld1", Dt, "\\{$Vd[], $dst2[]\\}, $Rn$Rm", "$Rn.addr = $wb", []>;
+
+def VLD1DUPd8_UPD : VLD1DUPWB<0b1100, {0,0,0,0}, "8">;
+def VLD1DUPd16_UPD : VLD1DUPWB<0b1100, {0,1,0,?}, "16"> { let Inst{4} = Rn{4}; }
+def VLD1DUPd32_UPD : VLD1DUPWB<0b1100, {1,0,0,?}, "32"> { let Inst{4} = Rn{4}; }
+
+def VLD1DUPq8_UPD : VLD1QDUPWB<0b1100, {0,0,1,0}, "8">;
+def VLD1DUPq16_UPD : VLD1QDUPWB<0b1100, {0,1,1,?}, "16"> {
+  let Inst{4} = Rn{4};
+}
+def VLD1DUPq32_UPD : VLD1QDUPWB<0b1100, {1,0,1,?}, "32"> {
+  let Inst{4} = Rn{4};
+}
+
+def VLD1DUPq8Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>;
+def VLD1DUPq16Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>;
+def VLD1DUPq32Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>;
+
 // VLD2DUP : Vector Load (single 2-element structure to all lanes)
 // VLD3DUP : Vector Load (single 3-element structure to all lanes)
 // VLD4DUP : Vector Load (single 4-element structure to all lanes)
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index 2cdf3a4ddc0..acef2f6bdbc 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -138,6 +138,8 @@ def IIC_VLD1x3u : InstrItinClass;
 def IIC_VLD1x4u : InstrItinClass;
 def IIC_VLD1ln : InstrItinClass;
 def IIC_VLD1lnu : InstrItinClass;
+def IIC_VLD1dup : InstrItinClass;
+def IIC_VLD1dupu : InstrItinClass;
 def IIC_VLD2 : InstrItinClass;
 def IIC_VLD2x2 : InstrItinClass;
 def IIC_VLD2u : InstrItinClass;
diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td
index bcd6c78ae9c..44756b7a068 100644
--- a/lib/Target/ARM/ARMScheduleA8.td
+++ b/lib/Target/ARM/ARMScheduleA8.td
@@ -475,6 +475,18 @@ def CortexA8Itineraries : ProcessorItineraries<
                                InstrStage<3, [A8_LSPipe]>],
                               [3, 2, 1, 1, 1, 1]>,
   //
+  // VLD1dup
+  InstrItinData<IIC_VLD1dup,  [InstrStage<2, [A8_Issue], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1]>,
+  //
+  // VLD1dupu
+  InstrItinData<IIC_VLD1dupu, [InstrStage<2, [A8_Issue], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1, 1]>,
+  //
   // VLD2
   InstrItinData<IIC_VLD2,     [InstrStage<2, [A8_Issue], 0>,
                                InstrStage<2, [A8_NLSPipe], 0>,
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 14bfdb7578f..0f4e79aeee3 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -813,6 +813,24 @@ def CortexA9Itineraries : ProcessorItineraries<
                               InstrStage<3, [A9_LSUnit]>],
                              [4, 2, 1, 1, 1, 1]>,
   //
+  // VLD1dup
+  InstrItinData<IIC_VLD1dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN], 0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 1]>,
+  //
+  // VLD1dupu
+  InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN], 0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 2, 1, 1]>,
+  //
   // VLD2
   InstrItinData<IIC_VLD2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
diff --git a/test/CodeGen/ARM/2009-11-02-NegativeLane.ll b/test/CodeGen/ARM/2009-11-02-NegativeLane.ll
index 89c9037bd9f..ca5ae8b62e8 100644
--- a/test/CodeGen/ARM/2009-11-02-NegativeLane.ll
+++ b/test/CodeGen/ARM/2009-11-02-NegativeLane.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=cortex-a8 < %s | grep vdup.16
+; RUN: llc -mcpu=cortex-a8 < %s | FileCheck %s
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
 target triple = "armv7-eabi"
 
@@ -7,6 +7,7 @@ entry:
   br i1 undef, label %return, label %bb
 
 bb:                                               ; preds = %bb, %entry
+; CHECK: vld1.16 {d16[], d17[]}
   %0 = load i16* undef, align 2
   %1 = insertelement <8 x i16> undef, i16 %0, i32 2
   %2 = insertelement <8 x i16> %1, i16 undef, i32 3
diff --git a/test/CodeGen/ARM/vlddup.ll b/test/CodeGen/ARM/vlddup.ll
new file mode 100644
index 00000000000..f172ed56d96
--- /dev/null
+++ b/test/CodeGen/ARM/vlddup.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+define <8 x i8> @vld1dupi8(i8* %A) nounwind {
+;CHECK: vld1dupi8:
+;Check the (default) alignment value.
+;CHECK: vld1.8 {d16[]}, [r0]
+	%tmp1 = load i8* %A, align 8
+	%tmp2 = insertelement <8 x i8> undef, i8 %tmp1, i32 0
+	%tmp3 = shufflevector <8 x i8> %tmp2, <8 x i8> undef, <8 x i32> zeroinitializer
+	ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vld1dupi16(i16* %A) nounwind {
+;CHECK: vld1dupi16:
+;Check the alignment value. Max for this instruction is 16 bits:
+;CHECK: vld1.16 {d16[]}, [r0, :16]
+	%tmp1 = load i16* %A, align 8
+	%tmp2 = insertelement <4 x i16> undef, i16 %tmp1, i32 0
+	%tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> undef, <4 x i32> zeroinitializer
+	ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vld1dupi32(i32* %A) nounwind {
+;CHECK: vld1dupi32:
+;Check the alignment value. Max for this instruction is 32 bits:
+;CHECK: vld1.32 {d16[]}, [r0, :32]
+	%tmp1 = load i32* %A, align 8
+	%tmp2 = insertelement <2 x i32> undef, i32 %tmp1, i32 0
+	%tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> undef, <2 x i32> zeroinitializer
+	ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @vld1dupQi8(i8* %A) nounwind {
+;CHECK: vld1dupQi8:
+;Check the (default) alignment value.
+;CHECK: vld1.8 {d16[], d17[]}, [r0]
+	%tmp1 = load i8* %A, align 8
+	%tmp2 = insertelement <16 x i8> undef, i8 %tmp1, i32 0
+	%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> zeroinitializer
+	ret <16 x i8> %tmp3
+}
-- 
2.34.1
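
For reference, a minimal IR sketch of the load + insertelement + shufflevector splat idiom that the new VLD1DUP patterns are meant to select. The function name below is illustrative and not part of the patch; the expected-assembly comment simply mirrors the vld1dupi32 test above when the file is run through llc -march=arm -mattr=+neon.

; Expected to hit the VLD1DUPd32 pattern and emit roughly:
;   vld1.32 {d16[]}, [r0, :32]
define <2 x i32> @example_vld1dup_i32(i32* %p) nounwind {
  ; scalar load that feeds the all-lanes duplicate
  %val = load i32* %p, align 8
  %vec = insertelement <2 x i32> undef, i32 %val, i32 0
  %dup = shufflevector <2 x i32> %vec, <2 x i32> undef, <2 x i32> zeroinitializer
  ret <2 x i32> %dup
}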