From ad3c6289b6552723bc80d66275b0ed0d771746a4 Mon Sep 17 00:00:00 2001
From: Quentin Colombet
Date: Thu, 21 Aug 2014 18:10:07 +0000
Subject: [PATCH] [AArch64] Run a peephole pass right after AdvSIMD pass.

The AdvSIMD pass may produce copies that are not coalescer-friendly. The
peephole optimizer knows how to fix that, as demonstrated in the test case.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@216200 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64TargetMachine.cpp  |  6 ++++-
 test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll | 25 ++++++++++++++++++--
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index d34be77f0ff..f3172a62f03 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -195,8 +195,12 @@ bool AArch64PassConfig::addILPOpts() {
 
 bool AArch64PassConfig::addPreRegAlloc() {
   // Use AdvSIMD scalar instructions whenever profitable.
-  if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar)
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar) {
     addPass(createAArch64AdvSIMDScalar());
+    // The AdvSIMD pass may produce copies that can be rewritten to
+    // be register coalescer friendly.
+    addPass(&PeepholeOptimizerID);
+  }
   return true;
 }
 
diff --git a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
index a3d7727c8ae..6266d1cc9b3 100644
--- a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
+++ b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
@@ -1,15 +1,36 @@
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s -check-prefix=GENERIC
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=true | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-NOOPT
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=false | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-OPT
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=true | FileCheck %s -check-prefix=GENERIC -check-prefix=GENERIC-NOOPT
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=false | FileCheck %s -check-prefix=GENERIC -check-prefix=GENERIC-OPT
 
 define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
 ; CHECK-LABEL: bar:
 ; CHECK: add.2d v[[REG:[0-9]+]], v0, v1
 ; CHECK: add d[[REG3:[0-9]+]], d[[REG]], d1
+; Without advanced copy optimization, we end up with cross register
+; bank copies that cannot be coalesced.
+; CHECK-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
+; With advanced copy optimization, we end up with just one copy
+; to insert the computed high part into the V register.
+; CHECK-OPT-NOT: fmov
 ; CHECK: sub d[[REG2:[0-9]+]], d[[REG]], d1
+; CHECK: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
+; CHECK-NOOPT: fmov d0, [[COPY_REG3]]
+; CHECK-OPT-NOT: fmov
+; CHECK: ins.d v0[1], [[COPY_REG2]]
+; CHECK-NEXT: ret
+;
 ; GENERIC-LABEL: bar:
 ; GENERIC: add v[[REG:[0-9]+]].2d, v0.2d, v1.2d
 ; GENERIC: add d[[REG3:[0-9]+]], d[[REG]], d1
+; GENERIC-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
+; GENERIC-OPT-NOT: fmov
 ; GENERIC: sub d[[REG2:[0-9]+]], d[[REG]], d1
+; GENERIC: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
+; GENERIC-NOOPT: fmov d0, [[COPY_REG3]]
+; GENERIC-OPT-NOT: fmov
+; GENERIC: ins v0.d[1], [[COPY_REG2]]
+; GENERIC-NEXT: ret
   %add = add <2 x i64> %a, %b
   %vgetq_lane = extractelement <2 x i64> %add, i32 0
   %vgetq_lane2 = extractelement <2 x i64> %b, i32 0
-- 
2.34.1
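
For a quick look outside the FileCheck harness, a standalone input of the same
shape as @bar can be fed to llc with the two -disable-adv-copy-opt settings used
in the RUN lines above. The function below is an illustrative sketch (the name
and body are invented, not the test's exact IR), and it assumes an llc built from
this revision with the AArch64 backend enabled; whether the AdvSIMD scalar pass
actually fires on it is subject to its usual profitability checks.

; scratch.ll - illustrative only; compare the copies emitted around the lane insert:
;   llc scratch.ll -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -disable-adv-copy-opt=true  -o -
;   llc scratch.ll -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -disable-adv-copy-opt=false -o -
define <2 x i64> @lane0_sum_into_lane1(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
  ; Scalar i64 arithmetic on lane 0 is the pattern the AdvSIMD scalar pass may keep in FPRs.
  %lane_a = extractelement <2 x i64> %a, i32 0
  %lane_b = extractelement <2 x i64> %b, i32 0
  %sum = add i64 %lane_a, %lane_b
  ; Reinserting the scalar into a vector lane is where cross register bank
  ; (GPR <-> FPR) copies can appear; the peephole run added by this patch is
  ; meant to rewrite them into a form the register coalescer can remove.
  %res = insertelement <2 x i64> %b, i64 %sum, i32 1
  ret <2 x i64> %res
}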