; Three configurations are tested: v8.0 (generic syntax), v8.1a (generic syntax)
; and v8.1a (apple syntax). Each FileCheck invocation also enables the plain
; CHECK prefix, because the per-function label directives in this file use it;
; with only the configuration-specific prefix they would be silently ignored.
1 ; RUN: llc < %s -verify-machineinstrs -march=arm64 -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V8a
2 ; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V81a
3 ; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V81a-apple
; Multiply step used by the tests in this file: sqrdmulh, vector and scalar forms.
5 declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
6 declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
7 declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
8 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
9 declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32)
10 declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16)
; Accumulate step of the sqrdmlah tests: sqadd of the accumulator and the product.
12 declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)
13 declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>)
14 declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>)
15 declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
16 declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
17 declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16)
; Accumulate step of the sqrdmlsh tests: sqsub of the accumulator and the product.
19 declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>)
20 declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>)
21 declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>)
22 declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
23 declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
24 declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16)
26 ;-----------------------------------------------------------------------------
28 ; test for SIMDThreeSameVectorSQRDMLxHTiedHS
; <4 x i16> vector form: sqadd(%acc, sqrdmulh(%mhs, %rhs)).
; The V8a checks expect a separate sqrdmulh; the V81a checks expect one sqrdmlah.
30 define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
31 ; CHECK-LABEL: test_sqrdmlah_v4i16:
32 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
33 %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
34 ; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h
35 ; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.4h
36 ; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2
; <8 x i16> vector form: sqadd(%acc, sqrdmulh(%mhs, %rhs)).
; The V8a checks expect a separate sqrdmulh; the V81a checks expect one sqrdmlah.
40 define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
41 ; CHECK-LABEL: test_sqrdmlah_v8i16:
42 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
43 %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
44 ; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
45 ; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.8h
46 ; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2
; <2 x i32> vector form: sqadd(%acc, sqrdmulh(%mhs, %rhs)).
; The V8a checks expect a separate sqrdmulh; the V81a checks expect one sqrdmlah.
50 define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
51 ; CHECK-LABEL: test_sqrdmlah_v2i32:
52 %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
53 %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
54 ; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
55 ; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.2s
56 ; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2
; <4 x i32> vector form: sqadd(%acc, sqrdmulh(%mhs, %rhs)).
; The V8a checks expect a separate sqrdmulh; the V81a checks expect one sqrdmlah.
; The v8.0 expectation uses the V8a prefix like its sibling tests, so that it is
; actually evaluated by the first FileCheck run.
60 define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
61 ; CHECK-LABEL: test_sqrdmlah_v4i32:
62 %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
63 %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
64 ; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
65 ; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.4s
66 ; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2
; <4 x i16> vector form: sqsub(%acc, sqrdmulh(%mhs, %rhs)).
; The V8a checks expect a separate sqrdmulh; the V81a checks expect one sqrdmlsh.
70 define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
71 ; CHECK-LABEL: test_sqrdmlsh_v4i16:
72 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
73 %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
74 ; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h
75 ; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.4h
76 ; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2
; <8 x i16> vector form: sqsub(%acc, sqrdmulh(%mhs, %rhs)).
; The V8a checks expect a separate sqrdmulh; the V81a checks expect one sqrdmlsh.
80 define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
81 ; CHECK-LABEL: test_sqrdmlsh_v8i16:
82 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
83 %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
84 ; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
85 ; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.8h
86 ; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2
; <2 x i32> vector form: sqsub(%acc, sqrdmulh(%mhs, %rhs)).
; The V8a checks expect a separate sqrdmulh; the V81a checks expect one sqrdmlsh.
90 define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
91 ; CHECK-LABEL: test_sqrdmlsh_v2i32:
92 %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
93 %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
94 ; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
95 ; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.2s
96 ; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2
; <4 x i32> vector form: sqsub(%acc, sqrdmulh(%mhs, %rhs)).
; The V8a checks expect a separate sqrdmulh; the V81a checks expect one sqrdmlsh.
100 define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
101 ; CHECK-LABEL: test_sqrdmlsh_v4i32:
102 %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
103 %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
104 ; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
105 ; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.4s
106 ; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2
107 ret <4 x i32> %retval
110 ;-----------------------------------------------------------------------------
111 ; RDMA Vector, by element
112 ; tests for vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied
; By-element form on <4 x i16>: lane 3 of %v is splatted, multiplied with %x by
; sqrdmulh, and the product is sqadd-ed to %acc. All three check lines expect the
; lane-indexed operand (v2.h[3], or v2[3] in apple syntax).
; The V8a directive is written without a space before the colon so that
; FileCheck recognises it.
114 define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
115 ; CHECK-LABEL: test_sqrdmlah_lane_s16:
117 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
118 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
119 %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
120 ; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3]
121 ; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.h[3]
122 ; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2[3]
123 ret <4 x i16> %retval
; By-element form on <8 x i16>: lane 2 of %v is splatted, multiplied with %x by
; sqrdmulh, and the product is sqadd-ed to %acc. The checks expect the
; lane-indexed operand (v2.h[2], or v2[2] in apple syntax).
126 define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
127 ; CHECK-LABEL: test_sqrdmlahq_lane_s16:
129 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
130 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
131 %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
132 ; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
133 ; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.h[2]
134 ; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2[2]
135 ret <8 x i16> %retval
; By-element form on <2 x i32>: lane 1 of %v is splatted, multiplied with %x by
; sqrdmulh, and the product is sqadd-ed to %acc. The checks expect the
; lane-indexed operand (v2.s[1], or v2[1] in apple syntax).
138 define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
139 ; CHECK-LABEL: test_sqrdmlah_lane_s32:
141 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
142 %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
143 %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
144 ; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1]
145 ; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.s[1]
146 ; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2[1]
147 ret <2 x i32> %retval
; By-element form on <4 x i32>: lane 0 of %v is splatted (zeroinitializer mask),
; multiplied with %x by sqrdmulh, and the product is sqadd-ed to %acc. The checks
; expect the lane-indexed operand (v2.s[0], or v2[0] in apple syntax).
150 define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
151 ; CHECK-LABEL: test_sqrdmlahq_lane_s32:
153 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
154 %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
155 %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
156 ; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0]
157 ; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.s[0]
158 ; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2[0]
159 ret <4 x i32> %retval
; By-element form on <4 x i16>: lane 3 of %v is splatted, multiplied with %x by
; sqrdmulh, and the product is sqsub-ed from %acc. The checks expect the
; lane-indexed operand (v2.h[3], or v2[3] in apple syntax).
162 define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
163 ; CHECK-LABEL: test_sqrdmlsh_lane_s16:
165 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
166 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
167 %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
168 ; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3]
169 ; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.h[3]
170 ; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2[3]
171 ret <4 x i16> %retval
; By-element form on <8 x i16>: lane 2 of %v is splatted, multiplied with %x by
; sqrdmulh, and the product is sqsub-ed from %acc. The checks expect the
; lane-indexed operand (v2.h[2], or v2[2] in apple syntax).
174 define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
175 ; CHECK-LABEL: test_sqrdmlshq_lane_s16:
177 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
178 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
179 %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
180 ; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
181 ; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.h[2]
182 ; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2[2]
183 ret <8 x i16> %retval
; By-element form on <2 x i32>: lane 1 of %v is splatted, multiplied with %x by
; sqrdmulh, and the product is sqsub-ed from %acc. The checks expect the
; lane-indexed operand (v2.s[1], or v2[1] in apple syntax).
186 define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
187 ; CHECK-LABEL: test_sqrdmlsh_lane_s32:
189 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
190 %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
191 %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
192 ; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1]
193 ; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.s[1]
194 ; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2[1]
195 ret <2 x i32> %retval
; By-element form on <4 x i32>: lane 0 of %v is splatted (zeroinitializer mask),
; multiplied with %x by sqrdmulh, and the product is sqsub-ed from %acc. The
; checks expect the lane-indexed operand (v2.s[0], or v2[0] in apple syntax).
198 define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
199 ; CHECK-LABEL: test_sqrdmlshq_lane_s32:
201 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
202 %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
203 %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
204 ; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0]
205 ; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.s[0]
206 ; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2[0]
207 ret <4 x i32> %retval
210 ;-----------------------------------------------------------------------------
211 ; RDMA Vector, by element, extracted
212 ; i16 tests are for vXi16_indexed in SIMDIndexedSQRDMLxHSDTied, with IR in ACLE style
213 ; i32 tests are for "def : Pat" in SIMDIndexedSQRDMLxHSDTied
; Scalar i16 accumulator with a <4 x i16> by-element multiply: lane 1 of %v is
; splatted and multiplied with %x, %acc is inserted into element 0 of an undef
; vector, the two are combined with sqadd, and element 0 is extracted.
; The V81a checks accept only a destination register matching v[2-9]+.
215 define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
216 ; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
218 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
219 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
220 %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
221 %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
222 %retval = extractelement <4 x i16> %retval_vec, i64 0
223 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1]
224 ; CHECK-V81a: sqrdmlah {{v[2-9]+}}.4h, v0.4h, v1.h[1]
225 ; CHECK-V81a-apple: sqrdmlah.4h {{v[2-9]+}}, v0, v1[1]
; Scalar i16 accumulator with an <8 x i16> by-element multiply: lane 1 of %v is
; splatted and multiplied with %x, %acc is inserted into element 0 of an undef
; vector, the two are combined with sqadd, and element 0 is extracted.
; The V81a checks accept only a destination register matching v[2-9]+.
229 define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
230 ; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
232 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
233 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
234 %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
235 %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
236 %retval = extractelement <8 x i16> %retval_vec, i64 0
237 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1]
238 ; CHECK-V81a: sqrdmlah {{v[2-9]+}}.8h, v0.8h, v1.h[1]
239 ; CHECK-V81a-apple: sqrdmlah.8h {{v[2-9]+}}, v0, v1[1]
; Scalar i32 accumulator with a <2 x i32> by-element multiply: lane 0 of %v is
; splatted and multiplied with %x, element 0 of the product is extracted and
; combined with %acc by the scalar sqadd.i32 intrinsic.
; The V81a checks expect the vector sqrdmlah writing v2.
243 define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
244 ; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
246 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
247 %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
248 %extract = extractelement <2 x i32> %prod, i64 0
249 %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
250 ; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0]
251 ; CHECK-V81a: sqrdmlah v2.2s, v0.2s, v1.s[0]
252 ; CHECK-V81a-apple: sqrdmlah.2s v2, v0, v1[0]
; Scalar i32 accumulator with a <4 x i32> by-element multiply: lane 0 of %v is
; splatted and multiplied with %x, element 0 of the product is extracted and
; combined with %acc by the scalar sqadd.i32 intrinsic.
; The V81a checks expect the vector sqrdmlah writing v2.
256 define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
257 ; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
259 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
260 %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
261 %extract = extractelement <4 x i32> %prod, i64 0
262 %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
263 ; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0]
264 ; CHECK-V81a: sqrdmlah v2.4s, v0.4s, v1.s[0]
265 ; CHECK-V81a-apple: sqrdmlah.4s v2, v0, v1[0]
; Scalar i16 accumulator with a <4 x i16> by-element multiply: lane 1 of %v is
; splatted and multiplied with %x, %acc is inserted into element 0 of an undef
; vector, the product is sqsub-ed from it, and element 0 is extracted.
; The V81a checks accept only a destination register matching v[2-9]+.
269 define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
270 ; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
272 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
273 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
274 %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
275 %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
276 %retval = extractelement <4 x i16> %retval_vec, i64 0
277 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1]
278 ; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.4h, v0.4h, v1.h[1]
279 ; CHECK-V81a-apple: sqrdmlsh.4h {{v[2-9]+}}, v0, v1[1]
; Scalar i16 accumulator with an <8 x i16> by-element multiply: lane 1 of %v is
; splatted and multiplied with %x, %acc is inserted into element 0 of an undef
; vector, the product is sqsub-ed from it, and element 0 is extracted.
; The V81a checks accept only a destination register matching v[2-9]+.
283 define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
284 ; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
286 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
287 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
288 %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
289 %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
290 %retval = extractelement <8 x i16> %retval_vec, i64 0
291 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1]
292 ; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.8h, v0.8h, v1.h[1]
293 ; CHECK-V81a-apple: sqrdmlsh.8h {{v[2-9]+}}, v0, v1[1]
; Scalar i32 accumulator with a <2 x i32> by-element multiply: lane 0 of %v is
; splatted and multiplied with %x, element 0 of the product is extracted and
; subtracted from %acc by the scalar sqsub.i32 intrinsic.
; The V81a checks expect the vector sqrdmlsh writing v2.
297 define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
298 ; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
300 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
301 %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
302 %extract = extractelement <2 x i32> %prod, i64 0
303 %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
304 ; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0]
305 ; CHECK-V81a: sqrdmlsh v2.2s, v0.2s, v1.s[0]
306 ; CHECK-V81a-apple: sqrdmlsh.2s v2, v0, v1[0]
; Scalar i32 accumulator with a <4 x i32> by-element multiply: lane 0 of %v is
; splatted and multiplied with %x, element 0 of the product is extracted and
; subtracted from %acc by the scalar sqsub.i32 intrinsic.
; The V81a checks expect the vector sqrdmlsh writing v2.
310 define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
311 ; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
313 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
314 %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
315 %extract = extractelement <4 x i32> %prod, i64 0
316 %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
317 ; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0]
318 ; CHECK-V81a: sqrdmlsh v2.4s, v0.4s, v1.s[0]
319 ; CHECK-V81a-apple: sqrdmlsh.4s v2, v0, v1[0]
323 ;-----------------------------------------------------------------------------
325 ; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstrInfo.td
; i16 scalars routed through <4 x i16> vectors: %x, %y and %acc are each inserted
; into element 0 of an undef vector, combined with vector sqrdmulh and sqadd, and
; element 0 of the result is extracted. The checks accept any v registers.
327 define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) {
328 ; CHECK-LABEL: test_sqrdmlah_v1i16:
329 %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
330 %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
331 %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec)
332 %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
333 %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
334 %retval = extractelement <4 x i16> %retval_vec, i64 0
335 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
336 ; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
337 ; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; i32 scalars routed through <4 x i32> vectors: %x, %y and %acc are each inserted
; into element 0 of an undef vector, combined with vector sqrdmulh and sqadd, and
; element 0 of the result is extracted. The checks accept any v registers.
341 define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) {
342 ; CHECK-LABEL: test_sqrdmlah_v1i32:
343 %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
344 %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
345 %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec)
346 %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
347 %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
348 %retval = extractelement <4 x i32> %retval_vec, i64 0
349 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
350 ; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
351 ; CHECK-V81a-apple: sqrdmlah.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; i16 scalars routed through <4 x i16> vectors: %x, %y and %acc are each inserted
; into element 0 of an undef vector, combined with vector sqrdmulh and sqsub, and
; element 0 of the result is extracted. The checks accept any v registers.
356 define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) {
357 ; CHECK-LABEL: test_sqrdmlsh_v1i16:
358 %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
359 %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
360 %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec)
361 %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
362 %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
363 %retval = extractelement <4 x i16> %retval_vec, i64 0
364 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
365 ; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
366 ; CHECK-V81a-apple: sqrdmlsh.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; i32 scalars routed through <4 x i32> vectors: %x, %y and %acc are each inserted
; into element 0 of an undef vector, combined with vector sqrdmulh and sqsub, and
; element 0 of the result is extracted. The checks accept any v registers.
370 define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) {
371 ; CHECK-LABEL: test_sqrdmlsh_v1i32:
372 %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
373 %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
374 %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec)
375 %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
376 %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
377 %retval = extractelement <4 x i32> %retval_vec, i64 0
378 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
379 ; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
380 ; CHECK-V81a-apple: sqrdmlsh.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; Pure scalar i32 form: sqadd.i32(%acc, sqrdmulh.i32(%mhs, %rhs)).
; The checks expect s-register operands; the apple-syntax expectation uses the
; same spelling as the generic one.
383 define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) {
384 ; CHECK-LABEL: test_sqrdmlah_i32:
385 %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
386 %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
387 ; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
388 ; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
389 ; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; Pure scalar i32 form: sqsub.i32(%acc, sqrdmulh.i32(%mhs, %rhs)).
; The checks expect s-register operands; the apple-syntax expectation uses the
; same spelling as the generic one.
393 define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
394 ; CHECK-LABEL: test_sqrdmlsh_i32:
395 %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
396 %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
397 ; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
398 ; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
399 ; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
403 ;-----------------------------------------------------------------------------
404 ; RDMA Scalar, by element
405 ; i16 tests are covered by the tests in the section above, with IR in ACLE style
406 ; i32 tests are for i32_indexed in SIMDIndexedSQRDMLxHSDTied
; i16 scalar-by-element via <4 x i16> vectors: lane 1 of %y_vec is splatted, %x
; and %acc are each inserted into element 0 of an undef vector, combined with
; sqrdmulh and sqadd, and element 0 is extracted. The checks expect the
; lane-indexed operand v0.h[1] (v0[1] in apple syntax).
408 define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) {
409 ; CHECK-LABEL: test_sqrdmlah_extract_i16:
410 %shuffle = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
411 %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
412 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %shuffle)
413 %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
414 %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
415 %retval = extractelement <4 x i16> %retval_vec, i32 0
416 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
417 ; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
418 ; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, v0[1]
; i32 scalar-by-element: element 3 of %rhs is extracted and multiplied with %mhs
; by the scalar sqrdmulh.i32, then sqadd.i32 combines the product with %acc.
; The checks expect s-register operands with the lane-indexed operand v0.s[3]
; (v0[3] in apple syntax).
422 define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
423 ; CHECK-LABEL: test_sqrdmlah_extract_i32:
424 %extract = extractelement <4 x i32> %rhs, i32 3
425 %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
426 %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
427 ; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
428 ; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
429 ; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
; i16 scalar-by-element via <8 x i16> vectors: lane 1 of %y_vec is splatted, %x
; and %acc are each inserted into element 0 of an undef vector, combined with
; sqrdmulh and sqsub, and element 0 is extracted. The checks expect the
; lane-indexed operand v0.h[1] (v0[1] in apple syntax).
433 define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) {
434 ; CHECK-LABEL: test_sqrdmlshq_extract_i16:
435 %shuffle = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1>
436 %x_vec = insertelement <8 x i16> undef, i16 %x, i64 0
437 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x_vec, <8 x i16> %shuffle)
438 %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
439 %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
440 %retval = extractelement <8 x i16> %retval_vec, i32 0
441 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
442 ; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
443 ; CHECK-V81a-apple: sqrdmlsh.8h {{v[0-9]+}}, {{v[0-9]+}}, v0[1]
; i32 scalar-by-element: element 3 of %rhs is extracted and multiplied with %mhs
; by the scalar sqrdmulh.i32, then sqsub.i32 subtracts the product from %acc.
; The checks expect s-register operands with the lane-indexed operand v0.s[3]
; (v0[3] in apple syntax).
447 define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
448 ; CHECK-LABEL: test_sqrdmlsh_extract_i32:
449 %extract = extractelement <4 x i32> %rhs, i32 3
450 %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
451 %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
452 ; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
453 ; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
454 ; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]