From: Vladimir Sukharev
Date: Tue, 31 Mar 2015 13:15:48 +0000 (+0000)
Subject: [AArch64] Add v8.1a "Rounding Double Multiply Add/Subtract" extension
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=e99524cf5221d0463301a59d14b04df82e3aabd8;p=oota-llvm.git

[AArch64] Add v8.1a "Rounding Double Multiply Add/Subtract" extension

Reviewers: t.p.northover, jmolloy

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D8502

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@233693 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index a91cf3bebb9..555b5006eba 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5300,6 +5300,27 @@ class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode,
   let Inst{4-0} = Rd;
 }
 
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDThreeScalarTied<bit U, bits<2> size, bit R, bits<5> opcode,
+                        dag oops, dag iops, string asm,
+                        list<dag> pattern>
+  : I<oops, iops, asm, "\t$Rd, $Rn, $Rm", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31-30} = 0b01;
+  let Inst{29} = U;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21} = R;
+  let Inst{20-16} = Rm;
+  let Inst{15-11} = opcode;
+  let Inst{10} = 1;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rd;
+}
+
 multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm,
                             SDPatternOperator OpNode> {
   def v1i64  : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm,
@@ ... @@ multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm,
   def v1i16  : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>;
 }
 
+multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm,
+                             SDPatternOperator OpNode = null_frag> {
+  def v1i32: BaseSIMDThreeScalarTied<U, 0b10, R, opc, (outs FPR32:$dst),
+                                     (ins FPR32:$Rd, FPR32:$Rn, FPR32:$Rm),
+                                     asm, []>;
+  def v1i16: BaseSIMDThreeScalarTied<U, 0b01, R, opc, (outs FPR16:$dst),
+                                     (ins FPR16:$Rd, FPR16:$Rn, FPR16:$Rm),
+                                     asm, []>;
+}
+
 multiclass SIMDThreeScalarSD<bit U, bit S, bits<5> opc, string asm,
                              SDPatternOperator OpNode = null_frag> {
   let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
@@ -8518,6 +8549,174 @@ multiclass SIMDLdSt4SingleAliases<string asm> {
   }
 }
 } // end of 'let Predicates = [HasNEON]'
 
+//----------------------------------------------------------------------------
+// AdvSIMD v8.1 Rounding Double Multiply Add/Subtract
+//----------------------------------------------------------------------------
+
+let Predicates = [HasNEON, HasV8_1a] in {
+
+class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode,
+                                    RegisterOperand regtype, string asm,
+                                    string kind, list<dag> pattern>
+  : BaseSIMDThreeSameVectorTied<Q, U, size, opcode, regtype, asm, kind,
+                                pattern> {
+  let Inst{21}=0;
+}
+multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm,
+                                             SDPatternOperator Accum> {
+  def v4i16 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b01, opc, V64, asm, ".4h",
+    [(set (v4i16 V64:$dst),
+          (Accum (v4i16 V64:$Rd),
+                 (v4i16 (int_aarch64_neon_sqrdmulh (v4i16 V64:$Rn),
+                                                   (v4i16 V64:$Rm)))))]>;
+  def v8i16 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b01, opc, V128, asm, ".8h",
+    [(set (v8i16 V128:$dst),
+          (Accum (v8i16 V128:$Rd),
+                 (v8i16 (int_aarch64_neon_sqrdmulh (v8i16 V128:$Rn),
+                                                   (v8i16 V128:$Rm)))))]>;
+  def v2i32 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b10, opc, V64, asm, ".2s",
+    [(set (v2i32 V64:$dst),
+          (Accum (v2i32 V64:$Rd),
+                 (v2i32 (int_aarch64_neon_sqrdmulh (v2i32 V64:$Rn),
+                                                   (v2i32 V64:$Rm)))))]>;
+  def v4i32 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b10, opc, V128, asm, ".4s",
+    [(set (v4i32 V128:$dst),
+          (Accum (v4i32 V128:$Rd),
+                 (v4i32 (int_aarch64_neon_sqrdmulh (v4i32 V128:$Rn),
+                                                   (v4i32 V128:$Rm)))))]>;
+}
+
+multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
+                                     SDPatternOperator Accum> {
+  def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
+                                          V64, V64, V128_lo, VectorIndexH,
+                                          asm, ".4h", ".4h", ".4h", ".h",
+    [(set (v4i16 
V64:$dst), + (Accum (v4i16 V64:$Rd), + (v4i16 (int_aarch64_neon_sqrdmulh + (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, + V128, V128, V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8i16 V128:$dst), + (Accum (v8i16 V128:$Rd), + (v8i16 (int_aarch64_neon_sqrdmulh + (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, + V64, V64, V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2i32 V64:$dst), + (Accum (v2i32 V64:$Rd), + (v2i32 (int_aarch64_neon_sqrdmulh + (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + // FIXME: it would be nice to use the scalar (v1i32) instruction here, but + // an intermediate EXTRACT_SUBREG would be untyped. + // FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we + // got it lowered here as (i32 vector_extract (v4i32 insert_subvector(..))) + def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), + (i32 (vector_extract + (v4i32 (insert_subvector + (undef), + (v2i32 (int_aarch64_neon_sqrdmulh + (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 + (v4i32 V128:$Rm), + VectorIndexS:$idx)))), + (i32 0))), + (i64 0))))), + (EXTRACT_SUBREG + (v2i32 (!cast(NAME # v2i32_indexed) + (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), + FPR32Op:$Rd, + ssub)), + V64:$Rn, + V128:$Rm, + VectorIndexS:$idx)), + ssub)>; + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4i32 V128:$dst), + (Accum (v4i32 V128:$Rd), + (v4i32 (int_aarch64_neon_sqrdmulh + (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + // FIXME: it would be nice to use the scalar (v1i32) instruction here, but + // an intermediate EXTRACT_SUBREG would be untyped. 
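+  // As with the v2i32 case above, the scalar i32 accumulate is handled by
+  // moving the accumulator into lane 0 of a vector register (INSERT_SUBREG
+  // with ssub), selecting the v4i32 indexed instruction, and extracting
+  // lane 0 of the result back into an FPR32 (EXTRACT_SUBREG with ssub).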
+ def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), + (i32 (vector_extract + (v4i32 (int_aarch64_neon_sqrdmulh + (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 + (v4i32 V128:$Rm), + VectorIndexS:$idx)))), + (i64 0))))), + (EXTRACT_SUBREG + (v4i32 (!cast(NAME # v4i32_indexed) + (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), + FPR32Op:$Rd, + ssub)), + V128:$Rn, + V128:$Rm, + VectorIndexS:$idx)), + ssub)>; + + def i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc, + FPR16Op, FPR16Op, V128_lo, + VectorIndexH, asm, ".h", "", "", ".h", + []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, + FPR32Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", + [(set (i32 FPR32Op:$dst), + (Accum (i32 FPR32Op:$Rd), + (i32 (int_aarch64_neon_sqrdmulh + (i32 FPR32Op:$Rn), + (i32 (vector_extract (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} +} // let Predicates = [HasNeon, HasV8_1a] + //---------------------------------------------------------------------------- // Crypto extensions //---------------------------------------------------------------------------- diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index ec6fa5cfa6a..25b9b2becd7 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -2778,6 +2778,10 @@ defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>; defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>; defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>; defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>; +defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah", + int_aarch64_neon_sqadd>; +defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh", + int_aarch64_neon_sqsub>; defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", @@ -2994,6 +2998,20 @@ defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl> defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>; defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>; defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>; +let Predicates = [HasV8_1a] in { + defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">; + defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">; + def : Pat<(i32 (int_aarch64_neon_sqadd + (i32 FPR32:$Rd), + (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn), + (i32 FPR32:$Rm))))), + (SQRDMLAHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>; + def : Pat<(i32 (int_aarch64_neon_sqsub + (i32 FPR32:$Rd), + (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn), + (i32 FPR32:$Rm))))), + (SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>; +} def : InstAlias<"cmls $dst, $src1, $src2", (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; @@ -4324,6 +4342,10 @@ defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", int_aarch64_neon_sqadd>; defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", int_aarch64_neon_sqsub>; +defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah", + int_aarch64_neon_sqadd>; +defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh", + int_aarch64_neon_sqsub>; defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>; defm UMLAL : 
SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal", TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; diff --git a/test/CodeGen/AArch64/arm64-neon-v8.1a.ll b/test/CodeGen/AArch64/arm64-neon-v8.1a.ll new file mode 100644 index 00000000000..25bc0854606 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-neon-v8.1a.ll @@ -0,0 +1,456 @@ +; RUN: llc < %s -verify-machineinstrs -march=arm64 | FileCheck %s --check-prefix=CHECK-V8a +; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a | FileCheck %s --check-prefix=CHECK-V81a +; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK-V81a-apple + +declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) +declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) +declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16) + +declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) +declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32) +declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16) + +declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) +declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32) +declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16) + +;----------------------------------------------------------------------------- +; RDMA Vector +; test for SIMDThreeSameVectorSQRDMLxHTiedHS + +define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) { +; CHECK-LABEL: test_sqrdmlah_v4i16: + %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs) + %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod) +; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h +; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.4h +; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2 + ret <4 x i16> %retval +} + +define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) { +; CHECK-LABEL: test_sqrdmlah_v8i16: + %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs) + %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod) +; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h +; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.8h +; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2 + ret <8 x i16> %retval +} + +define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) { +; CHECK-LABEL: test_sqrdmlah_v2i32: + %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs) + %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod) +; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s +; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.2s +; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2 + ret <2 x i32> %retval +} + +define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) { +; CHECK-LABEL: 
test_sqrdmlah_v4i32: + %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs) + %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod) +; CHECK-V81: sqrdmulh v1.4s, v1.4s, v2.4s +; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.4s +; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2 + ret <4 x i32> %retval +} + +define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) { +; CHECK-LABEL: test_sqrdmlsh_v4i16: + %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs) + %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod) +; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h +; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.4h +; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2 + ret <4 x i16> %retval +} + +define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) { +; CHECK-LABEL: test_sqrdmlsh_v8i16: + %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs) + %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod) +; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h +; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.8h +; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2 + ret <8 x i16> %retval +} + +define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) { +; CHECK-LABEL: test_sqrdmlsh_v2i32: + %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs) + %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod) +; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s +; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.2s +; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2 + ret <2 x i32> %retval +} + +define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) { +; CHECK-LABEL: test_sqrdmlsh_v4i32: + %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs) + %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod) +; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s +; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.4s +; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2 + ret <4 x i32> %retval +} + +;----------------------------------------------------------------------------- +; RDMA Vector, by element +; tests for vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied + +define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) { +; CHECK-LABEL: test_sqrdmlah_lane_s16: +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle) + %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod) +; CHECK-V8a : sqrdmulh v1.4h, v1.4h, v2.h[3] +; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.h[3] +; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2[3] + ret <4 x i16> %retval +} + +define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) { +; CHECK-LABEL: test_sqrdmlahq_lane_s16: +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> + %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle) + %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod) +; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2] +; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.h[2] +; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2[2] + ret <8 x i16> %retval +} + +define <2 x i32> 
@test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) { +; CHECK-LABEL: test_sqrdmlah_lane_s32: +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle) + %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod) +; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1] +; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.s[1] +; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2[1] + ret <2 x i32> %retval +} + +define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) { +; CHECK-LABEL: test_sqrdmlahq_lane_s32: +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer + %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle) + %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod) +; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0] +; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.s[0] +; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2[0] + ret <4 x i32> %retval +} + +define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) { +; CHECK-LABEL: test_sqrdmlsh_lane_s16: +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle) + %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod) +; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3] +; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.h[3] +; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2[3] + ret <4 x i16> %retval +} + +define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) { +; CHECK-LABEL: test_sqrdmlshq_lane_s16: +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> + %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle) + %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod) +; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2] +; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.h[2] +; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2[2] + ret <8 x i16> %retval +} + +define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) { +; CHECK-LABEL: test_sqrdmlsh_lane_s32: +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle) + %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod) +; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1] +; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.s[1] +; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2[1] + ret <2 x i32> %retval +} + +define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) { +; CHECK-LABEL: test_sqrdmlshq_lane_s32: +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer + %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle) + %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod) +; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0] +; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.s[0] +; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2[0] + ret <4 x i32> %retval +} + +;----------------------------------------------------------------------------- +; RDMA Vector, by element, extracted +; i16 tests are for vXi16_indexed in SIMDIndexedSQRDMLxHSDTied, with IR in ACLE style 
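+; (here "ACLE style" means the scalar i16 accumulator is exercised through the
+;  vector intrinsics: it is inserted into lane 0 of an undef vector, the
+;  v4i16/v8i16 sqadd/sqsub intrinsics are applied, and lane 0 of the result is
+;  extracted again)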
+; i32 tests are for "def : Pat" in SIMDIndexedSQRDMLxHSDTied + +define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) { +; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16: +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle) + %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0 + %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod) + %retval = extractelement <4 x i16> %retval_vec, i64 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1] +; CHECK-V81a: sqrdmlah {{v[2-9]+}}.4h, v0.4h, v1.h[1] +; CHECK-V81a-apple: sqrdmlah.4h {{v[2-9]+}}, v0, v1[1] + ret i16 %retval +} + +define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) { +; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16: +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> + %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle) + %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0 + %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod) + %retval = extractelement <8 x i16> %retval_vec, i64 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1] +; CHECK-V81a: sqrdmlah {{v[2-9]+}}.8h, v0.8h, v1.h[1] +; CHECK-V81a-apple: sqrdmlah.8h {{v[2-9]+}}, v0, v1[1] + ret i16 %retval +} + +define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) { +; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32: +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle) + %extract = extractelement <2 x i32> %prod, i64 0 + %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract) +; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0] +; CHECK-V81a: sqrdmlah v2.2s, v0.2s, v1.s[0] +; CHECK-V81a-apple: sqrdmlah.2s v2, v0, v1[0] + ret i32 %retval +} + +define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) { +; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32: +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer + %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle) + %extract = extractelement <4 x i32> %prod, i64 0 + %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract) +; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0] +; CHECK-V81a: sqrdmlah v2.4s, v0.4s, v1.s[0] +; CHECK-V81a-apple: sqrdmlah.4s v2, v0, v1[0] + ret i32 %retval +} + +define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) { +; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16: +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle) + %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0 + %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod) + %retval = extractelement <4 x i16> %retval_vec, i64 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1] +; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.4h, v0.4h, v1.h[1] +; CHECK-V81a-apple: sqrdmlsh.4h {{v[2-9]+}}, v0, v1[1] + ret i16 %retval +} + +define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) { +; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16: +entry: + 
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> + %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle) + %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0 + %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod) + %retval = extractelement <8 x i16> %retval_vec, i64 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1] +; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.8h, v0.8h, v1.h[1] +; CHECK-V81a-apple: sqrdmlsh.8h {{v[2-9]+}}, v0, v1[1] + ret i16 %retval +} + +define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) { +; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32: +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle) + %extract = extractelement <2 x i32> %prod, i64 0 + %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract) +; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0] +; CHECK-V81a: sqrdmlsh v2.2s, v0.2s, v1.s[0] +; CHECK-V81a-apple: sqrdmlsh.2s v2, v0, v1[0] + ret i32 %retval +} + +define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) { +; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32: +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer + %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle) + %extract = extractelement <4 x i32> %prod, i64 0 + %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract) +; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0] +; CHECK-V81a: sqrdmlsh v2.4s, v0.4s, v1.s[0] +; CHECK-V81a-apple: sqrdmlsh.4s v2, v0, v1[0] + ret i32 %retval +} + +;----------------------------------------------------------------------------- +; RDMA Scalar +; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstInfo.td + +define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) { +; CHECK-LABEL: test_sqrdmlah_v1i16: + %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0 + %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0 + %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec) + %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0 + %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec) + %retval = extractelement <4 x i16> %retval_vec, i64 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} + ret i16 %retval +} + +define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) { +; CHECK-LABEL: test_sqrdmlah_v1i32: + %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0 + %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0 + %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec) + %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0 + %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec) + %retval = extractelement <4 x i32> %retval_vec, i64 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK-V81a-apple: sqrdmlah.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} + ret i32 %retval +} + + +define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) { +; CHECK-LABEL: 
test_sqrdmlsh_v1i16: + %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0 + %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0 + %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec) + %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0 + %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec) + %retval = extractelement <4 x i16> %retval_vec, i64 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK-V81a-apple: sqrdmlsh.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} + ret i16 %retval +} + +define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) { +; CHECK-LABEL: test_sqrdmlsh_v1i32: + %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0 + %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0 + %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec) + %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0 + %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec) + %retval = extractelement <4 x i32> %retval_vec, i64 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK-V81a-apple: sqrdmlsh.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} + ret i32 %retval +} +define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) { +; CHECK-LABEL: test_sqrdmlah_i32: + %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs) + %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod) +; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + ret i32 %retval +} + +define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) { +; CHECK-LABEL: test_sqrdmlsh_i32: + %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs) + %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod) +; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + ret i32 %retval +} + +;----------------------------------------------------------------------------- +; RDMA Scalar, by element +; i16 tests are performed via tests in above chapter, with IR in ACLE style +; i32 tests are for i32_indexed in SIMDIndexedSQRDMLxHSDTied + +define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) { +; CHECK-LABEL: test_sqrdmlah_extract_i16: + %shuffle = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> + %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0 + %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %shuffle) + %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0 + %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod) + %retval = extractelement <4 x i16> %retval_vec, i32 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1] +; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1] +; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, v0[1] + ret i16 %retval +} + +define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) { +; CHECK-LABEL: test_sqrdmlah_extract_i32: + %extract = extractelement <4 x i32> %rhs, i32 3 + 
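+  ; the scalar i32 intrinsics applied to an extracted lane are expected to
+  ; select the lane-indexed form (i32_indexed in SIMDIndexedSQRDMLxHSDTied),
+  ; as the CHECK lines below require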
%prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract) + %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod) +; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3] +; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3] +; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3] + ret i32 %retval +} + +define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) { +; CHECK-LABEL: test_sqrdmlshq_extract_i16: + %shuffle = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> + %x_vec = insertelement <8 x i16> undef, i16 %x, i64 0 + %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x_vec, <8 x i16> %shuffle) + %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0 + %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod) + %retval = extractelement <8 x i16> %retval_vec, i32 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1] +; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1] +; CHECK-V81a-apple: sqrdmlsh.8h {{v[0-9]+}}, {{v[0-9]+}}, v0[1] + ret i16 %retval +} + +define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) { +; CHECK-LABEL: test_sqrdmlsh_extract_i32: + %extract = extractelement <4 x i32> %rhs, i32 3 + %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract) + %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod) +; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3] +; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3] +; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3] + ret i32 %retval +} diff --git a/test/MC/AArch64/armv8.1a-rdma.s b/test/MC/AArch64/armv8.1a-rdma.s new file mode 100644 index 00000000000..1de2a0fb15d --- /dev/null +++ b/test/MC/AArch64/armv8.1a-rdma.s @@ -0,0 +1,154 @@ +// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.1a -show-encoding < %s 2> %t | FileCheck %s +// RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s + .text + + //AdvSIMD RDMA vector + sqrdmlah v0.4h, v1.4h, v2.4h + sqrdmlsh v0.4h, v1.4h, v2.4h + sqrdmlah v0.2s, v1.2s, v2.2s + sqrdmlsh v0.2s, v1.2s, v2.2s + sqrdmlah v0.4s, v1.4s, v2.4s + sqrdmlsh v0.4s, v1.4s, v2.4s + sqrdmlah v0.8h, v1.8h, v2.8h + sqrdmlsh v0.8h, v1.8h, v2.8h +// CHECK: sqrdmlah v0.4h, v1.4h, v2.4h // encoding: [0x20,0x84,0x42,0x2e] +// CHECK: sqrdmlsh v0.4h, v1.4h, v2.4h // encoding: [0x20,0x8c,0x42,0x2e] +// CHECK: sqrdmlah v0.2s, v1.2s, v2.2s // encoding: [0x20,0x84,0x82,0x2e] +// CHECK: sqrdmlsh v0.2s, v1.2s, v2.2s // encoding: [0x20,0x8c,0x82,0x2e] +// CHECK: sqrdmlah v0.4s, v1.4s, v2.4s // encoding: [0x20,0x84,0x82,0x6e] +// CHECK: sqrdmlsh v0.4s, v1.4s, v2.4s // encoding: [0x20,0x8c,0x82,0x6e] +// CHECK: sqrdmlah v0.8h, v1.8h, v2.8h // encoding: [0x20,0x84,0x42,0x6e] +// CHECK: sqrdmlsh v0.8h, v1.8h, v2.8h // encoding: [0x20,0x8c,0x42,0x6e] + + sqrdmlah v0.2h, v1.2h, v2.2h + sqrdmlsh v0.2h, v1.2h, v2.2h + sqrdmlah v0.8s, v1.8s, v2.8s + sqrdmlsh v0.8s, v1.8s, v2.8s + sqrdmlah v0.2s, v1.4h, v2.8h + sqrdmlsh v0.4s, v1.8h, v2.2s +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlah v0.2h, v1.2h, v2.2h +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlah v0.2h, v1.2h, v2.2h +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlah v0.2h, v1.2h, v2.2h +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlah v0.2h, v1.2h, v2.2h +// CHECK-ERROR: ^ +// 
CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlsh v0.2h, v1.2h, v2.2h +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlsh v0.2h, v1.2h, v2.2h +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlsh v0.2h, v1.2h, v2.2h +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlsh v0.2h, v1.2h, v2.2h +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlah v0.8s, v1.8s, v2.8s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlah v0.8s, v1.8s, v2.8s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlah v0.8s, v1.8s, v2.8s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlah v0.8s, v1.8s, v2.8s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlsh v0.8s, v1.8s, v2.8s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlsh v0.8s, v1.8s, v2.8s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlsh v0.8s, v1.8s, v2.8s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlsh v0.8s, v1.8s, v2.8s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlah v0.2s, v1.4h, v2.8h +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlsh v0.4s, v1.8h, v2.2s +// CHECK-ERROR: ^ + + //AdvSIMD RDMA scalar + sqrdmlah h0, h1, h2 + sqrdmlsh h0, h1, h2 + sqrdmlah s0, s1, s2 + sqrdmlsh s0, s1, s2 +// CHECK: sqrdmlah h0, h1, h2 // encoding: [0x20,0x84,0x42,0x7e] +// CHECK: sqrdmlsh h0, h1, h2 // encoding: [0x20,0x8c,0x42,0x7e] +// CHECK: sqrdmlah s0, s1, s2 // encoding: [0x20,0x84,0x82,0x7e] +// CHECK: sqrdmlsh s0, s1, s2 // encoding: [0x20,0x8c,0x82,0x7e] + + //AdvSIMD RDMA vector by-element + sqrdmlah v0.4h, v1.4h, v2.h[3] + sqrdmlsh v0.4h, v1.4h, v2.h[3] + sqrdmlah v0.2s, v1.2s, v2.s[1] + sqrdmlsh v0.2s, v1.2s, v2.s[1] + sqrdmlah v0.8h, v1.8h, v2.h[3] + sqrdmlsh v0.8h, v1.8h, v2.h[3] + sqrdmlah v0.4s, v1.4s, v2.s[3] + sqrdmlsh v0.4s, v1.4s, v2.s[3] +// CHECK: sqrdmlah v0.4h, v1.4h, v2.h[3] // encoding: [0x20,0xd0,0x72,0x2f] +// CHECK: sqrdmlsh v0.4h, v1.4h, v2.h[3] // encoding: [0x20,0xf0,0x72,0x2f] +// CHECK: sqrdmlah v0.2s, v1.2s, v2.s[1] // encoding: [0x20,0xd0,0xa2,0x2f] +// CHECK: sqrdmlsh v0.2s, v1.2s, v2.s[1] // encoding: [0x20,0xf0,0xa2,0x2f] +// CHECK: sqrdmlah v0.8h, v1.8h, v2.h[3] // encoding: [0x20,0xd0,0x72,0x6f] +// CHECK: sqrdmlsh v0.8h, v1.8h, v2.h[3] // encoding: [0x20,0xf0,0x72,0x6f] +// CHECK: sqrdmlah v0.4s, v1.4s, v2.s[3] // encoding: [0x20,0xd8,0xa2,0x6f] +// CHECK: sqrdmlsh v0.4s, v1.4s, v2.s[3] // encoding: [0x20,0xf8,0xa2,0x6f] + + sqrdmlah v0.4s, v1.2s, v2.s[1] + sqrdmlsh v0.2s, v1.2d, v2.s[1] + sqrdmlah v0.8h, v1.8h, v2.s[3] + sqrdmlsh v0.8h, v1.8h, v2.h[8] +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlah v0.4s, v1.2s, v2.s[1] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlsh v0.2s, v1.2d, v2.s[1] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlah v0.8h, v1.8h, v2.s[3] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: vector lane must be an integer in range [0, 7]. 
+// CHECK-ERROR: sqrdmlsh v0.8h, v1.8h, v2.h[8] +// CHECK-ERROR: ^ + + //AdvSIMD RDMA scalar by-element + sqrdmlah h0, h1, v2.h[3] + sqrdmlsh h0, h1, v2.h[3] + sqrdmlah s0, s1, v2.s[3] + sqrdmlsh s0, s1, v2.s[3] +// CHECK: sqrdmlah h0, h1, v2.h[3] // encoding: [0x20,0xd0,0x72,0x7f] +// CHECK: sqrdmlsh h0, h1, v2.h[3] // encoding: [0x20,0xf0,0x72,0x7f] +// CHECK: sqrdmlah s0, s1, v2.s[3] // encoding: [0x20,0xd8,0xa2,0x7f] +// CHECK: sqrdmlsh s0, s1, v2.s[3] // encoding: [0x20,0xf8,0xa2,0x7f] + + sqrdmlah b0, h1, v2.h[3] + sqrdmlah s0, d1, v2.s[3] + sqrdmlsh h0, h1, v2.s[3] + sqrdmlsh s0, s1, v2.s[4] +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlah b0, h1, v2.h[3] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlah s0, d1, v2.s[3] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlsh h0, h1, v2.s[3] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: vector lane must be an integer in range [0, 3]. +// CHECK-ERROR: sqrdmlsh s0, s1, v2.s[4] +// CHECK-ERROR: ^ diff --git a/test/MC/Disassembler/AArch64/armv8.1a-rdma.txt b/test/MC/Disassembler/AArch64/armv8.1a-rdma.txt new file mode 100644 index 00000000000..5e1272f66f3 --- /dev/null +++ b/test/MC/Disassembler/AArch64/armv8.1a-rdma.txt @@ -0,0 +1,129 @@ +# RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.1a --disassemble < %s 2>&1 | FileCheck %s + +[0x20,0x84,0x02,0x2e] # sqrdmlah v0.8b, v1.8b, v2.8b +[0x20,0x8c,0x02,0x2e] # sqrdmlsh v0.8b, v1.8b, v2.8b +[0x20,0x84,0xc2,0x2e] # sqrdmlah v0.1d, v1.1d, v2.1d +[0x20,0x8c,0xc2,0x2e] # sqrdmlsh v0.1d, v1.1d, v2.1d +[0x20,0x84,0x02,0x6e] # sqrdmlah v0.16b, v1.16b, v2.16b +[0x20,0x8c,0x02,0x6e] # sqrdmlsh v0.16b, v1.16b, v2.16b +[0x20,0x84,0xc2,0x6e] # sqrdmlah v0.2d, v1.2d, v2.2d +[0x20,0x8c,0xc2,0x6e] # sqrdmlsh v0.2d, v1.2d, v2.2d +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x84,0x02,0x2e] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x8c,0x02,0x2e] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x84,0xc2,0x2e] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x8c,0xc2,0x2e] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x84,0x02,0x6e] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x8c,0x02,0x6e] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x84,0xc2,0x6e] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x8c,0xc2,0x6e] + +[0x20,0x84,0x02,0x7e] # sqrdmlah b0, b1, b2 +[0x20,0x8c,0x02,0x7e] # sqrdmlsh b0, b1, b2 +[0x20,0x84,0xc2,0x7e] # sqrdmlah d0, d1, d2 +[0x20,0x8c,0xc2,0x7e] # sqrdmlsh d0, d1, d2 +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x84,0x02,0x7e] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x8c,0x02,0x7e] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x84,0xc2,0x7e] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x8c,0xc2,0x7e] + +[0x20,0xd0,0x32,0x2f] # sqrdmlah v0.8b, v1.8b, v2.b[3] +[0x20,0xf0,0x32,0x2f] # sqrdmlsh v0.8b, v1.8b, v2.b[3] +[0x20,0xd0,0xe2,0x2f] # sqrdmlah v0.1d, v1.1d, v2.d[1] +[0x20,0xf0,0xe2,0x2f] # sqrdmlsh v0.1d, v1.1d, v2.d[1] +[0x20,0xd0,0x32,0x6f] # sqrdmlah v0.16b, v1.16b, v2.b[3] +[0x20,0xf0,0x32,0x6f] # sqrdmlsh v0.16b, v1.16b, v2.b[3] +[0x20,0xd8,0xe2,0x6f] # sqrdmlah v0.2d, v1.2d, v2.d[3] +[0x20,0xf8,0xe2,0x6f] # sqrdmlsh v0.2d, v1.2d, v2.d[3] +# CHECK: warning: invalid instruction encoding +# CHECK: 
[0x20,0xd0,0x32,0x2f] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xf0,0x32,0x2f] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xd0,0xe2,0x2f] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xf0,0xe2,0x2f] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xd0,0x32,0x6f] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xf0,0x32,0x6f] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xd8,0xe2,0x6f] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xf8,0xe2,0x6f] + +[0x20,0xd0,0x32,0x7f] # sqrdmlah b0, b1, v2.b[3] +[0x20,0xf0,0x32,0x7f] # sqrdmlsh b0, b1, v2.b[3] +[0x20,0xd8,0xe2,0x7f] # sqrdmlah d0, d1, v2.d[3] +[0x20,0xf8,0xe2,0x7f] # sqrdmlsh d0, d1, v2.d[3] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xd0,0x32,0x7f] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xf0,0x32,0x7f] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xd8,0xe2,0x7f] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xf8,0xe2,0x7f] + +[0x20,0x84,0x42,0x2e] +[0x20,0x8c,0x42,0x2e] +[0x20,0x84,0x82,0x2e] +[0x20,0x8c,0x82,0x2e] +[0x20,0x84,0x42,0x6e] +[0x20,0x8c,0x42,0x6e] +[0x20,0x84,0x82,0x6e] +[0x20,0x8c,0x82,0x6e] +# CHECK: sqrdmlah v0.4h, v1.4h, v2.4h +# CHECK: sqrdmlsh v0.4h, v1.4h, v2.4h +# CHECK: sqrdmlah v0.2s, v1.2s, v2.2s +# CHECK: sqrdmlsh v0.2s, v1.2s, v2.2s +# CHECK: sqrdmlah v0.8h, v1.8h, v2.8h +# CHECK: sqrdmlsh v0.8h, v1.8h, v2.8h +# CHECK: sqrdmlah v0.4s, v1.4s, v2.4s +# CHECK: sqrdmlsh v0.4s, v1.4s, v2.4s + +[0x20,0x84,0x42,0x7e] +[0x20,0x8c,0x42,0x7e] +[0x20,0x84,0x82,0x7e] +[0x20,0x8c,0x82,0x7e] +# CHECK: sqrdmlah h0, h1, h2 +# CHECK: sqrdmlsh h0, h1, h2 +# CHECK: sqrdmlah s0, s1, s2 +# CHECK: sqrdmlsh s0, s1, s2 + +0x20,0xd0,0x72,0x2f +0x20,0xf0,0x72,0x2f +0x20,0xd0,0xa2,0x2f +0x20,0xf0,0xa2,0x2f +0x20,0xd0,0x72,0x6f +0x20,0xf0,0x72,0x6f +0x20,0xd8,0xa2,0x6f +0x20,0xf8,0xa2,0x6f +# CHECK: sqrdmlah v0.4h, v1.4h, v2.h[3] +# CHECK: sqrdmlsh v0.4h, v1.4h, v2.h[3] +# CHECK: sqrdmlah v0.2s, v1.2s, v2.s[1] +# CHECK: sqrdmlsh v0.2s, v1.2s, v2.s[1] +# CHECK: sqrdmlah v0.8h, v1.8h, v2.h[3] +# CHECK: sqrdmlsh v0.8h, v1.8h, v2.h[3] +# CHECK: sqrdmlah v0.4s, v1.4s, v2.s[3] +# CHECK: sqrdmlsh v0.4s, v1.4s, v2.s[3] + +0x20,0xd0,0x72,0x7f +0x20,0xf0,0x72,0x7f +0x20,0xd8,0xa2,0x7f +0x20,0xf8,0xa2,0x7f +# CHECK: sqrdmlah h0, h1, v2.h[3] +# CHECK: sqrdmlsh h0, h1, v2.h[3] +# CHECK: sqrdmlah s0, s1, v2.s[3] +# CHECK: sqrdmlsh s0, s1, v2.s[3]
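
Taken together, the changes above mean that with +v8.1a enabled, a sqrdmulh whose result feeds a saturating add or subtract of the same type is selected as a single sqrdmlah/sqrdmlsh rather than a sqrdmulh followed by sqadd/sqsub. A minimal sketch in the style of the CodeGen tests above (the function name and RUN line are illustrative only):

; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a | FileCheck %s
; Illustrative example mirroring test_sqrdmlah_v4i16 above.
declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)

define <4 x i16> @fuse_sqrdmlah(<4 x i16> %acc, <4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: fuse_sqrdmlah:
; CHECK: sqrdmlah v0.4h, v1.4h, v2.4h
  %mul = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %b)
  %res = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %mul)
  ret <4 x i16> %res
}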