From 8927f6cd0f8d7614e8682a2da08f0a9769be9603 Mon Sep 17 00:00:00 2001
From: Tim Northover
Date: Mon, 3 Aug 2015 17:20:10 +0000
Subject: [PATCH] ARM: prefer allocating VFP regs at stride 4 on Darwin.

This is necessary for WatchOS support, where the compact unwind format
assumes this kind of layout. For now we only want this on Swift-like
CPUs though, where it's been the Xcode behaviour for ages. Also, since
it can expand the prologue, we don't want it at -Oz.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@243884 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMRegisterInfo.td     | 29 ++++++++++++++++++++---------
 lib/Target/ARM/ARMSubtarget.cpp       |  4 ++++
 lib/Target/ARM/ARMSubtarget.h         |  2 ++
 test/CodeGen/ARM/fold-stack-adjust.ll |  5 +----
 test/CodeGen/ARM/vfp-reg-stride.ll    | 33 +++++++++++++++++++++++++++++++++
 5 files changed, 60 insertions(+), 13 deletions(-)
 create mode 100644 test/CodeGen/ARM/vfp-reg-stride.ll

diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index 45cc9ea91f3..7c0319a01ee 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -266,12 +266,19 @@ def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> {
 }
 
 // Scalar single precision floating point register class..
-// FIXME: Allocation order changed to s0, s2, s4, ... as a quick hack to
-// avoid partial-write dependencies on D registers (S registers are
-// renamed as portions of D registers).
-def SPR : RegisterClass<"ARM", [f32], 32, (add (decimate
-                                                (sequence "S%u", 0, 31), 2),
-                                               (sequence "S%u", 0, 31))>;
+// FIXME: Allocation order changed to s0, s2, ... or s0, s4, ... as a quick hack
+// to avoid partial-write dependencies on D or Q (depending on platform)
+// registers (S registers are renamed as portions of D/Q registers).
+def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)> {
+  let AltOrders = [(add (decimate SPR, 2), SPR),
+                   (add (decimate SPR, 4),
+                        (decimate SPR, 2),
+                        (decimate (rotl SPR, 1), 4),
+                        (decimate (rotl SPR, 1), 2))];
+  let AltOrderSelect = [{
+    return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+  }];
+}
 
 // Subset of SPR which can be used as a source of NEON scalars for 16-bit
 // operations
@@ -283,9 +290,13 @@ def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)>;
 // is double-word alignment though.
 def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
                         (sequence "D%u", 0, 31)> {
-  // Allocate non-VFP2 registers D16-D31 first.
-  let AltOrders = [(rotl DPR, 16)];
-  let AltOrderSelect = [{ return 1; }];
+  // Allocate non-VFP2 registers D16-D31 first, and prefer even registers on
+  // Darwin platforms.
+  let AltOrders = [(rotl DPR, 16),
+                   (add (decimate (rotl DPR, 16), 2), (rotl DPR, 16))];
+  let AltOrderSelect = [{
+    return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+  }];
 }
 
 // Subset of DPR that are accessible with VFP2 (and so that also have
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 4c6e69654d5..b91e9ae650c 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -285,6 +285,10 @@ bool ARMSubtarget::enableAtomicExpand() const {
   return hasAnyDataBarrier() && !isThumb1Only();
 }
 
+bool ARMSubtarget::useStride4VFPs(const MachineFunction &MF) const {
+  return isSwift() && !MF.getFunction()->hasFnAttribute(Attribute::MinSize);
+}
+
 bool ARMSubtarget::useMovt(const MachineFunction &MF) const {
   // NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit
   // immediates as it is inherently position independent, and may be out of
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index d6d3d83c87e..e95096146b5 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -413,6 +413,8 @@ public:
     return isTargetMachO() ? (ReserveR9 || !HasV6Ops) : ReserveR9;
   }
 
+  bool useStride4VFPs(const MachineFunction &MF) const;
+
   bool useMovt(const MachineFunction &MF) const;
 
   bool supportsTailCall() const { return SupportsTailCall; }
diff --git a/test/CodeGen/ARM/fold-stack-adjust.ll b/test/CodeGen/ARM/fold-stack-adjust.ll
index b62d0dfee07..99174213562 100644
--- a/test/CodeGen/ARM/fold-stack-adjust.ll
+++ b/test/CodeGen/ARM/fold-stack-adjust.ll
@@ -60,8 +60,6 @@ define void @check_vfp_fold() minsize {
 ; CHECK: vpush {d6, d7, d8, d9}
 ; CHECK-NOT: sub sp,
 ; ...
-; CHECK: vldmia r[[GLOBREG]], {d8, d9}
-; ...
 ; CHECK-NOT: add sp,
 ; CHECK: vpop {d6, d7, d8, d9}
 ; CHECK: pop {r[[GLOBREG]], pc}
@@ -82,9 +80,8 @@ define void @check_vfp_fold() minsize {
 
   %var = alloca i8, i32 16
 
-  %tmp = load %bigVec, %bigVec* @var
+  call void asm "", "r,~{d8},~{d9}"(i8* %var)
   call void @bar(i8* %var)
-  store %bigVec %tmp, %bigVec* @var
 
   ret void
 }
diff --git a/test/CodeGen/ARM/vfp-reg-stride.ll b/test/CodeGen/ARM/vfp-reg-stride.ll
new file mode 100644
index 00000000000..5484cc810b0
--- /dev/null
+++ b/test/CodeGen/ARM/vfp-reg-stride.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mcpu=swift -mtriple=thumbv7s-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-STRIDE4
+; RUN: llc -mcpu=cortex-a57 -mtriple=thumbv7-linux-gnueabihf -o - %s | FileCheck %s --check-prefix=CHECK-GENERIC
+
+define void @test_reg_stride(double %a, double %b) {
+; CHECK-STRIDE4-LABEL: test_reg_stride:
+; CHECK-STRIDE4-DAG: vmov d16, r
+; CHECK-STRIDE4-DAG: vmov d18, r
+
+; CHECK-GENERIC-LABEL: test_reg_stride:
+; CHECK-GENERIC-DAG: vmov.f64 d16, {{d[01]}}
+; CHECK-GENERIC-DAG: vmov.f64 d17, {{d[01]}}
+
+  call void asm "", "~{r0},~{r1},~{d0},~{d1}"()
+  call arm_aapcs_vfpcc void @eat_doubles(double %a, double %b)
+  ret void
+}
+
+define void @test_stride_minsize(float %a, float %b) minsize {
+; CHECK-STRIDE4-LABEL: test_stride_minsize:
+; CHECK-STRIDE4: vmov d2, {{r[01]}}
+; CHECK-STRIDE4: vmov d3, {{r[01]}}
+
+; CHECK-GENERIC-LABEL: test_stride_minsize:
+; CHECK-GENERIC-DAG: vmov.f32 s4, {{s[01]}}
+; CHECK-GENERIC-DAG: vmov.f32 s6, {{s[01]}}
+  call void asm "", "~{r0},~{r1},~{s0},~{s1},~{d0},~{d1}"()
+  call arm_aapcs_vfpcc void @eat_floats(float %a, float %b)
+  ret void
+}
+
+
+declare arm_aapcs_vfpcc void @eat_doubles(double, double)
+declare arm_aapcs_vfpcc void @eat_floats(float, float)
-- 
2.34.1
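
Editor's note, placed after the mail signature so git am still applies the
patch unchanged: AltOrderSelect returns an index where 0 selects the register
class's member list and N selects AltOrders[N-1], so "1 + useStride4VFPs(MF)"
picks the long-standing stride-2 order by default and the new stride-4 order
on Swift-like CPUs outside minsize functions. The standalone C++ sketch below
is not LLVM code; it re-implements the TableGen set operators (sequence,
decimate, rotl, add) used in the SPR definition, purely to make the resulting
stride-4 allocation order visible. Helper names are ad hoc, not LLVM APIs.

// Standalone sketch mimicking the TableGen dag set operators, so the
// stride-4 SPR allocation order defined in the patch can be printed.
#include <algorithm>
#include <cstdio>
#include <initializer_list>
#include <string>
#include <vector>

using RegList = std::vector<std::string>;

// (sequence "S%u", From, To): S<From>, S<From+1>, ..., S<To>.
static RegList sequence(unsigned From, unsigned To) {
  RegList Out;
  for (unsigned I = From; I <= To; ++I)
    Out.push_back("S" + std::to_string(I));
  return Out;
}

// (decimate L, N): every N-th element, starting with the first.
static RegList decimate(const RegList &L, unsigned N) {
  RegList Out;
  for (unsigned I = 0; I < L.size(); I += N)
    Out.push_back(L[I]);
  return Out;
}

// (rotl L, N): rotate the list left by N positions.
static RegList rotl(RegList L, unsigned N) {
  std::rotate(L.begin(), L.begin() + N % L.size(), L.end());
  return L;
}

// (add A, B, ...): concatenate, keeping the first occurrence of each register.
static RegList add(std::initializer_list<RegList> Lists) {
  RegList Out;
  for (const RegList &L : Lists)
    for (const std::string &R : L)
      if (std::find(Out.begin(), Out.end(), R) == Out.end())
        Out.push_back(R);
  return Out;
}

int main() {
  RegList SPR = sequence(0, 31);
  // AltOrders[1] from the SPR definition in the patch above.
  RegList Stride4 = add({decimate(SPR, 4), decimate(SPR, 2),
                         decimate(rotl(SPR, 1), 4), decimate(rotl(SPR, 1), 2)});
  for (const std::string &R : Stride4)
    std::printf("%s ", R.c_str());
  std::printf("\n");
}

Compiled with any C++11 compiler, this prints S0 S4 ... S28, then the
remaining even registers S2 S6 ... S30, then S1 S5 ... S29, then
S3 S7 ... S31: the allocator tries registers four apart first, which is the
stride-4 preference the commit message describes.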