From a05d8be1a2048529fd63aca98acc609f25890c74 Mon Sep 17 00:00:00 2001
From: Ahmed Bougacha
Date: Tue, 4 Aug 2015 00:42:34 +0000
Subject: [PATCH] [AArch64] Vector FCOPYSIGN supports Custom-lowering: mark it
 as such.

There's a bunch of code in LowerFCOPYSIGN that does smart lowering, and
is actually already vector-aware; let's use it instead of scalarizing!

The only interesting change is that for v2f32, we previously used v4i32
as the integer vector type. Use v2i32 instead, and mark FCOPYSIGN as
Custom.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@243926 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64ISelLowering.cpp |  5 +-
 test/CodeGen/AArch64/vector-fcopysign.ll   | 58 +++++-----------------
 2 files changed, 17 insertions(+), 46 deletions(-)

diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index a0d7c640729..99239c6fd0d 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -632,6 +632,9 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
     setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
     setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
     setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
+
+    // But we do support custom-lowering for FCOPYSIGN.
+    setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom);
   }
 
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
@@ -3651,7 +3654,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
   SDValue VecVal1, VecVal2;
   if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
     EltVT = MVT::i32;
-    VecVT = MVT::v4i32;
+    VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
     EltMask = 0x80000000ULL;
 
     if (!VT.isVector()) {
diff --git a/test/CodeGen/AArch64/vector-fcopysign.ll b/test/CodeGen/AArch64/vector-fcopysign.ll
index 504e47664f8..f3e2827ce65 100644
--- a/test/CodeGen/AArch64/vector-fcopysign.ll
+++ b/test/CodeGen/AArch64/vector-fcopysign.ll
@@ -8,12 +8,8 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 define <1 x float> @test_copysign_v1f32_v1f32(<1 x float> %a, <1 x float> %b) #0 {
 ; CHECK-LABEL: test_copysign_v1f32_v1f32:
 ; CHECK: ; BB#0:
-; CHECK-NEXT: mov s2, v1[1]
-; CHECK-NEXT: mov s3, v0[1]
-; CHECK-NEXT: movi.4s v4, #0x80, lsl #24
-; CHECK-NEXT: bit.16b v3, v2, v4
-; CHECK-NEXT: bit.16b v0, v1, v4
-; CHECK-NEXT: ins.s v0[1], v3[0]
+; CHECK-NEXT: movi.2s v2, #0x80, lsl #24
+; CHECK-NEXT: bit.8b v0, v1, v2
 ; CHECK-NEXT: ret
   %r = call <1 x float> @llvm.copysign.v1f32(<1 x float> %a, <1 x float> %b)
   ret <1 x float> %r
@@ -68,12 +64,8 @@ declare <1 x double> @llvm.copysign.v1f64(<1 x double> %a, <1 x double> %b) #0
 define <2 x float> @test_copysign_v2f32_v2f32(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-LABEL: test_copysign_v2f32_v2f32:
 ; CHECK: ; BB#0:
-; CHECK-NEXT: mov s2, v1[1]
-; CHECK-NEXT: mov s3, v0[1]
-; CHECK-NEXT: movi.4s v4, #0x80, lsl #24
-; CHECK-NEXT: bit.16b v3, v2, v4
-; CHECK-NEXT: bit.16b v0, v1, v4
-; CHECK-NEXT: ins.s v0[1], v3[0]
+; CHECK-NEXT: movi.2s v2, #0x80, lsl #24
+; CHECK-NEXT: bit.8b v0, v1, v2
 ; CHECK-NEXT: ret
   %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b)
   ret <2 x float> %r
@@ -103,20 +95,8 @@ declare <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b) #0
 define <4 x float> @test_copysign_v4f32_v4f32(<4 x float> %a, <4 x float> %b) #0 {
 ; CHECK-LABEL: test_copysign_v4f32_v4f32:
 ; CHECK: ; BB#0:
-; CHECK-NEXT: mov s2, v1[1]
-; CHECK-NEXT: mov s3, v0[1]
-; CHECK-NEXT: movi.4s v4, #0x80, lsl #24
-; CHECK-NEXT: mov s5, v0[2]
-; CHECK-NEXT: bit.16b v3, v2, v4
-; CHECK-NEXT: mov s2, v0[3]
-; CHECK-NEXT: mov s6, v1[2]
-; CHECK-NEXT: bit.16b v0, v1, v4
-; CHECK-NEXT: bit.16b v5, v6, v4
-; CHECK-NEXT: mov s1, v1[3]
-; CHECK-NEXT: ins.s v0[1], v3[0]
-; CHECK-NEXT: ins.s v0[2], v5[0]
-; CHECK-NEXT: bit.16b v2, v1, v4
-; CHECK-NEXT: ins.s v0[3], v2[0]
+; CHECK-NEXT: movi.4s v2, #0x80, lsl #24
+; CHECK-NEXT: bit.16b v0, v1, v2
 ; CHECK-NEXT: ret
   %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b)
   ret <4 x float> %r
@@ -174,13 +154,9 @@ define <2 x double> @test_copysign_v2f64_v232(<2 x double> %a, <2 x float> %b) #
 define <2 x double> @test_copysign_v2f64_v2f64(<2 x double> %a, <2 x double> %b) #0 {
 ; CHECK-LABEL: test_copysign_v2f64_v2f64:
 ; CHECK: ; BB#0:
-; CHECK-NEXT: mov d2, v1[1]
-; CHECK-NEXT: mov d3, v0[1]
-; CHECK-NEXT: movi.2d v4, #0000000000000000
-; CHECK-NEXT: fneg.2d v4, v4
-; CHECK-NEXT: bit.16b v3, v2, v4
-; CHECK-NEXT: bit.16b v0, v1, v4
-; CHECK-NEXT: ins.d v0[1], v3[0]
+; CHECK-NEXT: movi.2d v2, #0000000000000000
+; CHECK-NEXT: fneg.2d v2, v2
+; CHECK-NEXT: bit.16b v0, v1, v2
 ; CHECK-NEXT: ret
   %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b)
   ret <2 x double> %r
@@ -221,18 +197,10 @@ define <4 x double> @test_copysign_v4f64_v4f32(<4 x double> %a, <4 x float> %b)
 define <4 x double> @test_copysign_v4f64_v4f64(<4 x double> %a, <4 x double> %b) #0 {
 ; CHECK-LABEL: test_copysign_v4f64_v4f64:
 ; CHECK: ; BB#0:
-; CHECK-NEXT: mov d4, v2[1]
-; CHECK-NEXT: mov d5, v0[1]
-; CHECK-NEXT: movi.2d v6, #0000000000000000
-; CHECK-NEXT: fneg.2d v6, v6
-; CHECK-NEXT: bit.16b v5, v4, v6
-; CHECK-NEXT: mov d4, v3[1]
-; CHECK-NEXT: bit.16b v0, v2, v6
-; CHECK-NEXT: mov d2, v1[1]
-; CHECK-NEXT: bit.16b v2, v4, v6
-; CHECK-NEXT: bit.16b v1, v3, v6
-; CHECK-NEXT: ins.d v0[1], v5[0]
-; CHECK-NEXT: ins.d v1[1], v2[0]
+; CHECK-NEXT: movi.2d v4, #0000000000000000
+; CHECK-NEXT: fneg.2d v4, v4
+; CHECK-NEXT: bit.16b v0, v2, v4
+; CHECK-NEXT: bit.16b v1, v3, v4
 ; CHECK-NEXT: ret
   %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b)
   ret <4 x double> %r
-- 
2.34.1