From: Silviu Baranga Date: Mon, 29 Jul 2013 09:25:50 +0000 (+0000) Subject: Allow generation of vmla.f32 instructions when targeting Cortex-A15. The patch also... X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=fd79485dfa4fee67467299720efac2d0c21d846c;p=oota-llvm.git Allow generation of vmla.f32 instructions when targeting Cortex-A15. The patch also adds the VFP4 feature to Cortex-A15 and fixes the DontUseFusedMAC predicate so that we can still generate vmla.f32 instructions on non-darwin targets with VFP4. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@187349 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 46928dcb2ee..e5da3a54390 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -179,7 +179,7 @@ def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift", // FIXME: It has not been determined if A15 has these features. def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15", "Cortex-A15 ARM processors", - [FeatureT2XtPk, FeatureFP16, + [FeatureT2XtPk, FeatureFP16, FeatureVFP4, FeatureAvoidPartialCPSR, FeatureTrustZone]>; def ProcR5 : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5", diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 31ce38e503c..4ca3af6b3ea 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -422,7 +422,7 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const { if (!CheckVMLxHazard) return true; - if (!Subtarget->isCortexA8() && !Subtarget->isLikeA9() && + if (!Subtarget->isCortexA8() && !Subtarget->isCortexA9() && !Subtarget->isSwift()) return true; diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index f543e5d7ee8..c2434023f8f 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -262,7 +262,9 @@ def UseMulOps : Predicate<"Subtarget->useMulOps()">; def UseFusedMAC : 
Predicate<"(TM.Options.AllowFPOpFusion ==" " FPOpFusion::Fast) && " "!Subtarget->isTargetDarwin()">; -def DontUseFusedMAC : Predicate<"!Subtarget->hasVFP4() || " +def DontUseFusedMAC : Predicate<"!(TM.Options.AllowFPOpFusion ==" + " FPOpFusion::Fast &&" + " Subtarget->hasVFP4()) || " "Subtarget->isTargetDarwin()">; // VGETLNi32 is microcoded on Swift - prefer VMOV. diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 354a7797dea..1ba78e4a984 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -169,7 +169,7 @@ bool ARMPassConfig::addPreRegAlloc() { // FIXME: temporarily disabling load / store optimization pass for Thumb1. if (getOptLevel() != CodeGenOpt::None && !getARMSubtarget().isThumb1Only()) addPass(createARMLoadStoreOptimizationPass(true)); - if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isLikeA9()) + if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isCortexA9()) addPass(createMLxExpansionPass()); // Since the A15SDOptimizer pass can insert VDUP instructions, it can only be // enabled when NEON is available. diff --git a/test/CodeGen/ARM/a15-mla.ll b/test/CodeGen/ARM/a15-mla.ll index 25f6de4762d..b233cc27c4b 100644 --- a/test/CodeGen/ARM/a15-mla.ll +++ b/test/CodeGen/ARM/a15-mla.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -march=arm -float-abi=hard -mcpu=cortex-a15 -mattr=+neon,+neonfp | FileCheck %s ; This test checks that the VMLxForwarting feature is disabled for A15. -; CHECK: fun_a +; CHECK: fun_a: define <4 x i32> @fun_a(<4 x i32> %x, <4 x i32> %y) nounwind{ %1 = add <4 x i32> %x, %y ; CHECK-NOT: vmul @@ -10,3 +10,27 @@ define <4 x i32> @fun_a(<4 x i32> %x, <4 x i32> %y) nounwind{ %3 = add <4 x i32> %y, %2 ret <4 x i32> %3 } + +; This test checks that VMLA FP patterns can be matched in instruction selection when targeting +; Cortex-A15. 
+; CHECK: fun_b: +define <4 x float> @fun_b(<4 x float> %x, <4 x float> %y, <4 x float> %z) nounwind{ +; CHECK: vmla.f32 + %t = fmul <4 x float> %x, %y + %r = fadd <4 x float> %t, %z + ret <4 x float> %r +} + +; This test checks that FP VMLA instructions are not expanded into separate multiply/addition +; operations when targeting Cortex-A15. +; CHECK: fun_c: +define <4 x float> @fun_c(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %u, <4 x float> %v) nounwind{ +; CHECK: vmla.f32 + %t1 = fmul <4 x float> %x, %y + %r1 = fadd <4 x float> %t1, %z +; CHECK: vmla.f32 + %t2 = fmul <4 x float> %u, %v + %r2 = fadd <4 x float> %t2, %r1 + ret <4 x float> %r2 +} +