test/CodeGen/AArch64/arm64-neon-v8.1a.ll

   1 ; RUN: llc < %s -verify-machineinstrs -march=arm64 -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V8a
   2 ; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V81a
   3 ; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V81a-apple
   4
   5 declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
   6 declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
   7 declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
   8 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
   9 declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32)
  10 declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16)
  11
  12 declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)
  13 declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>)
  14 declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>)
  15 declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
  16 declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
  17 declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16)
  18
  19 declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>)
  20 declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>)
  21 declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>)
  22 declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
  23 declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
  24 declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16)
  25
  26 ;-----------------------------------------------------------------------------
  27 ; RDMA Vector
  28 ; test for SIMDThreeSameVectorSQRDMLxHTiedHS
  29
  30 define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
  31 ; CHECK-LABEL: test_sqrdmlah_v4i16:
  32 ; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.4h
  33 ; CHECK-V81a:       sqrdmlah    v0.4h, v1.4h, v2.4h
  34 ; CHECK-V81a-apple: sqrdmlah.4h v0,    v1,    v2
  35   %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
  36   %res = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %mulh)
  37   ret <4 x i16> %res
  38 }
  39
  40 define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
  41 ; CHECK-LABEL: test_sqrdmlah_v8i16:
  42 ; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.8h
  43 ; CHECK-V81a:       sqrdmlah    v0.8h, v1.8h, v2.8h
  44 ; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2
  45   %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
  46   %res = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %mulh)
  47   ret <8 x i16> %res
  48 }
  49
  50 define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
  51 ; CHECK-LABEL: test_sqrdmlah_v2i32:
  52 ; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.2s
  53 ; CHECK-V81a:       sqrdmlah    v0.2s, v1.2s, v2.2s
  54 ; CHECK-V81a-apple: sqrdmlah.2s v0,    v1,    v2
  55   %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
  56   %res = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %mulh)
  57   ret <2 x i32> %res
  58 }
  59
  60 define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
  61 ; CHECK-LABEL: test_sqrdmlah_v4i32:
  62    %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
  63    %retval =  call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
  64 ; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.4s
  65 ; CHECK-V81a:       sqrdmlah    v0.4s, v1.4s, v2.4s
  66 ; CHECK-V81a-apple: sqrdmlah.4s v0,    v1,    v2
  67    ret <4 x i32> %retval
  68 }
  69
  70 define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
  71 ; CHECK-LABEL: test_sqrdmlsh_v4i16:
  72 ; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.4h
  73 ; CHECK-V81a:       sqrdmlsh    v0.4h, v1.4h, v2.4h
  74 ; CHECK-V81a-apple: sqrdmlsh.4h v0,    v1,    v2
  75   %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
  76   %res = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %mulh)
  77   ret <4 x i16> %res
  78 }
  79
  80 define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
  81 ; CHECK-LABEL: test_sqrdmlsh_v8i16:
  82 ; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.8h
  83 ; CHECK-V81a:       sqrdmlsh    v0.8h, v1.8h, v2.8h
  84 ; CHECK-V81a-apple: sqrdmlsh.8h v0,    v1,    v2
  85   %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
  86   %res = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %mulh)
  87   ret <8 x i16> %res
  88 }
  89
  90 define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
  91 ; CHECK-LABEL: test_sqrdmlsh_v2i32:
  92 ; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.2s
  93 ; CHECK-V81a:       sqrdmlsh    v0.2s, v1.2s, v2.2s
  94 ; CHECK-V81a-apple: sqrdmlsh.2s v0,    v1,    v2
  95   %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
  96   %res = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %mulh)
  97   ret <2 x i32> %res
  98 }
  99
 100 define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
 101 ; CHECK-LABEL: test_sqrdmlsh_v4i32:
 102 ; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.4s
 103 ; CHECK-V81a:       sqrdmlsh    v0.4s, v1.4s, v2.4s
 104 ; CHECK-V81a-apple: sqrdmlsh.4s v0,    v1,    v2
 105   %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
 106   %res = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %mulh)
 107   ret <4 x i32> %res
 108 }
 109
 110 ;-----------------------------------------------------------------------------
 111 ; RDMA Vector, by element
 112 ; tests for vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied
 113
 114 define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
 115 ; CHECK-LABEL: test_sqrdmlah_lane_s16:
 116 entry:
 117   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 118   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
 119   %retval =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
 120 ; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.h[3]
 121 ; CHECK-V81a:       sqrdmlah    v0.4h, v1.4h, v2.h[3]
 122 ; CHECK-V81a-apple: sqrdmlah.4h v0,    v1,    v2[3]
 123   ret <4 x i16> %retval
 124 }
 125
 126 define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
 127 ; CHECK-LABEL: test_sqrdmlahq_lane_s16:
 128 ; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.h[2]
 129 ; CHECK-V81a:       sqrdmlah    v0.8h, v1.8h, v2.h[2]
 130 ; CHECK-V81a-apple: sqrdmlah.8h v0,    v1,    v2[2]
 131 entry:
 132   %dup = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
 133   %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %dup)
 134   %res = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %mulh)
 135   ret <8 x i16> %res
 136 }
 137
 138 define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
 139 ; CHECK-LABEL: test_sqrdmlah_lane_s32:
 140 ; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.s[1]
 141 ; CHECK-V81a:       sqrdmlah    v0.2s, v1.2s, v2.s[1]
 142 ; CHECK-V81a-apple: sqrdmlah.2s v0,    v1,    v2[1]
 143 entry:
 144   %dup = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 145   %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %dup)
 146   %res = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %mulh)
 147   ret <2 x i32> %res
 148 }
 149
 150 define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
 151 ; CHECK-LABEL: test_sqrdmlahq_lane_s32:
 152 ; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.s[0]
 153 ; CHECK-V81a:       sqrdmlah    v0.4s, v1.4s, v2.s[0]
 154 ; CHECK-V81a-apple: sqrdmlah.4s v0,    v1,    v2[0]
 155 entry:
 156   %dup = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
 157   %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %dup)
 158   %res = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %mulh)
 159   ret <4 x i32> %res
 160 }
 161
 162 define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
 163 ; CHECK-LABEL: test_sqrdmlsh_lane_s16:
 164 ; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.h[3]
 165 ; CHECK-V81a:       sqrdmlsh    v0.4h, v1.4h, v2.h[3]
 166 ; CHECK-V81a-apple: sqrdmlsh.4h v0,    v1,    v2[3]
 167 entry:
 168   %dup = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 169   %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %dup)
 170   %res = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %mulh)
 171   ret <4 x i16> %res
 172 }
 173
 174 define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
 175 ; CHECK-LABEL: test_sqrdmlshq_lane_s16:
 176 ; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.h[2]
 177 ; CHECK-V81a:       sqrdmlsh    v0.8h, v1.8h, v2.h[2]
 178 ; CHECK-V81a-apple: sqrdmlsh.8h v0,    v1,    v2[2]
 179 entry:
 180   %dup = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
 181   %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %dup)
 182   %res = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %mulh)
 183   ret <8 x i16> %res
 184 }
 185
 186 define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
 187 ; CHECK-LABEL: test_sqrdmlsh_lane_s32:
 188 ; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.s[1]
 189 ; CHECK-V81a:       sqrdmlsh    v0.2s, v1.2s, v2.s[1]
 190 ; CHECK-V81a-apple: sqrdmlsh.2s v0,    v1,    v2[1]
 191 entry:
 192   %dup = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 193   %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %dup)
 194   %res = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %mulh)
 195   ret <2 x i32> %res
 196 }
 197
 198 define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
 199 ; CHECK-LABEL: test_sqrdmlshq_lane_s32:
 200 ; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.s[0]
 201 ; CHECK-V81a:       sqrdmlsh    v0.4s, v1.4s, v2.s[0]
 202 ; CHECK-V81a-apple: sqrdmlsh.4s v0,    v1,    v2[0]
 203 entry:
 204   %dup = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
 205   %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %dup)
 206   %res = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %mulh)
 207   ret <4 x i32> %res
 208 }
 209
 210 ;-----------------------------------------------------------------------------
 211 ; RDMA Vector, by element, extracted
 212 ; i16 tests are for vXi16_indexed in SIMDIndexedSQRDMLxHSDTied, with IR in ACLE style
 213 ; i32 tests are for   "def : Pat" in SIMDIndexedSQRDMLxHSDTied
 214
 215 define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
 216 ; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
 217 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, v0.4h, v1.h[1]
 218 ; CHECK-V81a:       sqrdmlah    {{v[2-9]+}}.4h, v0.4h, v1.h[1]
 219 ; CHECK-V81a-apple: sqrdmlah.4h {{v[2-9]+}},    v0,    v1[1]
 220 entry:
 221   %dup = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 222   %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %dup)
 223   %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
 224   %res.v = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh)
 225   %res = extractelement <4 x i16> %res.v, i64 0
 226   ret i16 %res
 227 }
 228
 229 define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
 230 ; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
 231 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, v0.8h, v1.h[1]
 232 ; CHECK-V81a:       sqrdmlah    {{v[2-9]+}}.8h, v0.8h, v1.h[1]
 233 ; CHECK-V81a-apple: sqrdmlah.8h {{v[2-9]+}},    v0,    v1[1]
 234 entry:
 235   %dup = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 236   %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %dup)
 237   %acc.v = insertelement <8 x i16> undef, i16 %acc, i64 0
 238   %res.v = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc.v, <8 x i16> %mulh)
 239   %res = extractelement <8 x i16> %res.v, i64 0
 240   ret i16 %res
 241 }
 242
 243 define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
 244 ; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
 245 ; CHECK-V8a:        sqrdmulh    v0.2s, v0.2s, v1.s[0]
 246 ; CHECK-V81a:       sqrdmlah    v2.2s, v0.2s, v1.s[0]
 247 ; CHECK-V81a-apple: sqrdmlah.2s v2,    v0,    v1[0]
 248 entry:
 249   %dup = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
 250   %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %dup)
 251   %mulh0 = extractelement <2 x i32> %mulh, i64 0
 252   %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %mulh0)
 253   ret i32 %res
 254 }
 255
 256 define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
 257 ; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
 258 ; CHECK-V8a:        sqrdmulh    v0.4s, v0.4s, v1.s[0]
 259 ; CHECK-V81a:       sqrdmlah    v2.4s, v0.4s, v1.s[0]
 260 ; CHECK-V81a-apple: sqrdmlah.4s v2,    v0,    v1[0]
 261 entry:
 262   %dup = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
 263   %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %dup)
 264   %mulh0 = extractelement <4 x i32> %mulh, i64 0
 265   %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %mulh0)
 266   ret i32 %res
 267 }
 268
 269 define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
 270 ; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
 271 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, v0.4h, v1.h[1]
 272 ; CHECK-V81a:       sqrdmlsh    {{v[2-9]+}}.4h, v0.4h, v1.h[1]
 273 ; CHECK-V81a-apple: sqrdmlsh.4h {{v[2-9]+}},    v0,    v1[1]
 274 entry:
 275   %dup = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 276   %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %dup)
 277   %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
 278   %res.v = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh)
 279   %res = extractelement <4 x i16> %res.v, i64 0
 280   ret i16 %res
 281 }
 282
 283 define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
 284 ; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
 285 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, v0.8h, v1.h[1]
 286 ; CHECK-V81a:       sqrdmlsh    {{v[2-9]+}}.8h, v0.8h, v1.h[1]
 287 ; CHECK-V81a-apple: sqrdmlsh.8h {{v[2-9]+}},    v0,    v1[1]
 288 entry:
 289   %dup = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 290   %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %dup)
 291   %acc.v = insertelement <8 x i16> undef, i16 %acc, i64 0
 292   %res.v = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc.v, <8 x i16> %mulh)
 293   %res = extractelement <8 x i16> %res.v, i64 0
 294   ret i16 %res
 295 }
 296
 297 define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
 298 ; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
 299 ; CHECK-V8a:        sqrdmulh    v0.2s, v0.2s, v1.s[0]
 300 ; CHECK-V81a:       sqrdmlsh    v2.2s, v0.2s, v1.s[0]
 301 ; CHECK-V81a-apple: sqrdmlsh.2s v2,    v0,    v1[0]
 302 entry:
 303   %dup = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
 304   %mulh = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %dup)
 305   %mulh0 = extractelement <2 x i32> %mulh, i64 0
 306   %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %mulh0)
 307   ret i32 %res
 308 }
 309
 310 define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
 311 ; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
 312 ; CHECK-V8a:        sqrdmulh    v0.4s, v0.4s, v1.s[0]
 313 ; CHECK-V81a:       sqrdmlsh    v2.4s, v0.4s, v1.s[0]
 314 ; CHECK-V81a-apple: sqrdmlsh.4s v2,    v0,    v1[0]
 315 entry:
 316   %dup = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
 317   %mulh = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %dup)
 318   %mulh0 = extractelement <4 x i32> %mulh, i64 0
 319   %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %mulh0)
 320   ret i32 %res
 321 }
 322
 323 ;-----------------------------------------------------------------------------
 324 ; RDMA Scalar
 325 ; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstrInfo.td
 326
 327 define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) {
 328 ; CHECK-LABEL: test_sqrdmlah_v1i16:
 329 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 330 ; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 331 ; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
 332   %x.v = insertelement <4 x i16> undef, i16 %x, i64 0
 333   %y.v = insertelement <4 x i16> undef, i16 %y, i64 0
 334   %mulh.v = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x.v, <4 x i16> %y.v)
 335   %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
 336   %res.v = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh.v)
 337   %res = extractelement <4 x i16> %res.v, i64 0
 338   ret i16 %res
 339 }
 340
 341 define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) {
 342 ; CHECK-LABEL: test_sqrdmlah_v1i32:
 343 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 344 ; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 345 ; CHECK-V81a-apple: sqrdmlah.4s {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
 346   %x.v = insertelement <4 x i32> undef, i32 %x, i64 0
 347   %y.v = insertelement <4 x i32> undef, i32 %y, i64 0
 348   %mulh.v = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x.v, <4 x i32> %y.v)
 349   %acc.v = insertelement <4 x i32> undef, i32 %acc, i64 0
 350   %res.v = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc.v, <4 x i32> %mulh.v)
 351   %res = extractelement <4 x i32> %res.v, i64 0
 352   ret i32 %res
 353 }
 354
 355
 356 define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) {
 357 ; CHECK-LABEL: test_sqrdmlsh_v1i16:
 358 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 359 ; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 360 ; CHECK-V81a-apple: sqrdmlsh.4h {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
 361   %x.v = insertelement <4 x i16> undef, i16 %x, i64 0
 362   %y.v = insertelement <4 x i16> undef, i16 %y, i64 0
 363   %mulh.v = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x.v, <4 x i16> %y.v)
 364   %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
 365   %res.v = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh.v)
 366   %res = extractelement <4 x i16> %res.v, i64 0
 367   ret i16 %res
 368 }
 369
 370 define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) {
 371 ; CHECK-LABEL: test_sqrdmlsh_v1i32:
 372 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 373 ; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 374 ; CHECK-V81a-apple: sqrdmlsh.4s {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
 375   %x.v = insertelement <4 x i32> undef, i32 %x, i64 0
 376   %y.v = insertelement <4 x i32> undef, i32 %y, i64 0
 377   %mulh.v = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x.v, <4 x i32> %y.v)
 378   %acc.v = insertelement <4 x i32> undef, i32 %acc, i64 0
 379   %res.v = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc.v, <4 x i32> %mulh.v)
 380   %res = extractelement <4 x i32> %res.v, i64 0
 381   ret i32 %res
 382 }
 383 define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) {
 384 ; CHECK-LABEL: test_sqrdmlah_i32:
 385 ; CHECK-V8a:        sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 386 ; CHECK-V81a:       sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 387 ; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 388   %mulh = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
 389   %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %mulh)
 390   ret i32 %res
 391 }
 392
 393 define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
 394 ; CHECK-LABEL: test_sqrdmlsh_i32:
 395 ; CHECK-V8a:        sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 396 ; CHECK-V81a:       sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 397 ; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 398   %mulh = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
 399   %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %mulh)
 400   ret i32 %res
 401 }
 402
 403 ;-----------------------------------------------------------------------------
 404 ; RDMA Scalar, by element
 405 ; i16 tests are covered by the tests in the section above, with IR in ACLE style
 406 ; i32 tests are for i32_indexed in SIMDIndexedSQRDMLxHSDTied
 407
 408 define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) {
 409 ; CHECK-LABEL: test_sqrdmlah_extract_i16:
 410 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
 411 ; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
 412 ; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}},    {{v[0-9]+}}, v0[1]
 413   %dup = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 414   %x.v = insertelement <4 x i16> undef, i16 %x, i64 0
 415   %mulh = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x.v, <4 x i16> %dup)
 416   %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
 417   %res.v = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc.v, <4 x i16> %mulh)
 418   %res = extractelement <4 x i16> %res.v, i32 0
 419   ret i16 %res
 420 }
 421
 422 define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
 423 ; CHECK-LABEL: test_sqrdmlah_extract_i32:
 424 ; CHECK-V8a:        sqrdmulh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
 425 ; CHECK-V81a:       sqrdmlah   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
 426 ; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
 427   %lane3 = extractelement <4 x i32> %rhs, i32 3
 428   %mulh = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %lane3)
 429   %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %mulh)
 430   ret i32 %res
 431 }
 432
 433 define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) {
 434 ; CHECK-LABEL: test_sqrdmlshq_extract_i16:
 435 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
 436 ; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
 437 ; CHECK-V81a-apple: sqrdmlsh.8h {{v[0-9]+}},    {{v[0-9]+}}, v0[1]
 438   %dup = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 439   %x.v = insertelement <8 x i16> undef, i16 %x, i64 0
 440   %mulh = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x.v, <8 x i16> %dup)
 441   %acc.v = insertelement <8 x i16> undef, i16 %acc, i64 0
 442   %res.v = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc.v, <8 x i16> %mulh)
 443   %res = extractelement <8 x i16> %res.v, i32 0
 444   ret i16 %res
 445 }
 446
 447 define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
 448 ; CHECK-LABEL: test_sqrdmlsh_extract_i32:
 449 ; CHECK-V8a:        sqrdmulh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
 450 ; CHECK-V81a:       sqrdmlsh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
 451 ; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
 452   %lane3 = extractelement <4 x i32> %rhs, i32 3
 453   %mulh = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %lane3)
 454   %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %mulh)
 455   ret i32 %res
 456 }