; RUN: llc -show-mc-encoding -march=arm -mcpu=cortex-a8 -mattr=+neon < %s | FileCheck %s
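
; This file checks both instruction selection and the fixed MC encodings
; for the NEON multiply family: vmul (integer, float, and polynomial),
; vqdmulh, vqrdmulh, vmull, and vqdmull, on d (64-bit) and q (128-bit)
; registers.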

; CHECK: vmul_8xi8
define <8 x i8> @vmul_8xi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
; CHECK: vmul.i8 d16, d16, d17 @ encoding: [0xb1,0x09,0x40,0xf2]
  %tmp3 = mul <8 x i8> %tmp1, %tmp2
  ret <8 x i8> %tmp3
}

; CHECK: vmul_4xi16
define <4 x i16> @vmul_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
; CHECK: vmul.i16 d16, d16, d17 @ encoding: [0xb1,0x09,0x50,0xf2]
  %tmp3 = mul <4 x i16> %tmp1, %tmp2
  ret <4 x i16> %tmp3
}

; CHECK: vmul_2xi32
define <2 x i32> @vmul_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
; CHECK: vmul.i32 d16, d16, d17 @ encoding: [0xb1,0x09,0x60,0xf2]
  %tmp3 = mul <2 x i32> %tmp1, %tmp2
  ret <2 x i32> %tmp3
}

; CHECK: vmul_2xfloat
define <2 x float> @vmul_2xfloat(<2 x float>* %A, <2 x float>* %B) nounwind {
  %tmp1 = load <2 x float>* %A
  %tmp2 = load <2 x float>* %B
; CHECK: vmul.f32 d16, d16, d17 @ encoding: [0xb1,0x0d,0x40,0xf3]
  %tmp3 = fmul <2 x float> %tmp1, %tmp2
  ret <2 x float> %tmp3
}

; CHECK: vmul_16xi8
define <16 x i8> @vmul_16xi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
  %tmp1 = load <16 x i8>* %A
  %tmp2 = load <16 x i8>* %B
; CHECK: vmul.i8 q8, q8, q9 @ encoding: [0xf2,0x09,0x40,0xf2]
  %tmp3 = mul <16 x i8> %tmp1, %tmp2
  ret <16 x i8> %tmp3
}

; CHECK: vmul_8xi16
define <8 x i16> @vmul_8xi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i16>* %B
; CHECK: vmul.i16 q8, q8, q9 @ encoding: [0xf2,0x09,0x50,0xf2]
  %tmp3 = mul <8 x i16> %tmp1, %tmp2
  ret <8 x i16> %tmp3
}

; CHECK: vmul_4xi32
define <4 x i32> @vmul_4xi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i32>* %B
; CHECK: vmul.i32 q8, q8, q9 @ encoding: [0xf2,0x09,0x60,0xf2]
  %tmp3 = mul <4 x i32> %tmp1, %tmp2
  ret <4 x i32> %tmp3
}

; CHECK: vmul_4xfloat
define <4 x float> @vmul_4xfloat(<4 x float>* %A, <4 x float>* %B) nounwind {
  %tmp1 = load <4 x float>* %A
  %tmp2 = load <4 x float>* %B
; CHECK: vmul.f32 q8, q8, q9 @ encoding: [0xf2,0x0d,0x40,0xf3]
  %tmp3 = fmul <4 x float> %tmp1, %tmp2
  ret <4 x float> %tmp3
}
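
; vmul.p8 is a polynomial (carry-less) multiply: each 8-bit product is
; formed over GF(2), with XOR in place of addition, and truncated to
; 8 bits. There is no plain IR operator for this, so it is reached
; through the llvm.arm.neon.vmulp intrinsic.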

declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

; CHECK: vmulp_8xi8
define <8 x i8> @vmulp_8xi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
; CHECK: vmul.p8 d16, d16, d17 @ encoding: [0xb1,0x09,0x40,0xf3]
  %tmp3 = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

; CHECK: vmulp_16xi8
define <16 x i8> @vmulp_16xi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
  %tmp1 = load <16 x i8>* %A
  %tmp2 = load <16 x i8>* %B
; CHECK: vmul.p8 q8, q8, q9 @ encoding: [0xf2,0x09,0x40,0xf3]
  %tmp3 = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}
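
; vqdmulh is a saturating doubling multiply returning the high half:
; for s16 elements, result = sat((2 * a * b) >> 16). The only case that
; saturates is 0x8000 * 0x8000, which yields 0x7fff.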

declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

; CHECK: vqdmulh_4xi16
define <4 x i16> @vqdmulh_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
; CHECK: vqdmulh.s16 d16, d16, d17 @ encoding: [0xa1,0x0b,0x50,0xf2]
  %tmp3 = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

; CHECK: vqdmulh_2xi32
define <2 x i32> @vqdmulh_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
; CHECK: vqdmulh.s32 d16, d16, d17 @ encoding: [0xa1,0x0b,0x60,0xf2]
  %tmp3 = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; CHECK: vqdmulh_8xi16
define <8 x i16> @vqdmulh_8xi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i16>* %B
; CHECK: vqdmulh.s16 q8, q8, q9 @ encoding: [0xe2,0x0b,0x50,0xf2]
  %tmp3 = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

; CHECK: vqdmulh_4xi32
define <4 x i32> @vqdmulh_4xi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i32>* %B
; CHECK: vqdmulh.s32 q8, q8, q9 @ encoding: [0xe2,0x0b,0x60,0xf2]
  %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}
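
; vqrdmulh is the rounding variant of vqdmulh: a rounding constant of
; 1 << (esize-1) is added before the high half is taken, i.e. for s16
; elements result = sat((2 * a * b + 0x8000) >> 16).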

declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone

; CHECK: vqrdmulh_4xi16
define <4 x i16> @vqrdmulh_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
; CHECK: vqrdmulh.s16 d16, d16, d17 @ encoding: [0xa1,0x0b,0x50,0xf3]
  %tmp3 = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

; CHECK: vqrdmulh_2xi32
define <2 x i32> @vqrdmulh_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
; CHECK: vqrdmulh.s32 d16, d16, d17 @ encoding: [0xa1,0x0b,0x60,0xf3]
  %tmp3 = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; CHECK: vqrdmulh_8xi16
define <8 x i16> @vqrdmulh_8xi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i16>* %B
; CHECK: vqrdmulh.s16 q8, q8, q9 @ encoding: [0xe2,0x0b,0x50,0xf3]
  %tmp3 = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

; CHECK: vqrdmulh_4xi32
define <4 x i32> @vqrdmulh_4xi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i32>* %B
; CHECK: vqrdmulh.s32 q8, q8, q9 @ encoding: [0xe2,0x0b,0x60,0xf3]
  %tmp3 = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}
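
; The widening multiplies are selected from plain IR: vmull.s8/s16/s32
; is formed by matching a sign extension of both operands followed by a
; multiply in the wider type.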

; CHECK: vmulls_8xi16
define <8 x i16> @vmulls_8xi16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
; CHECK: vmull.s8 q8, d16, d17 @ encoding: [0xa1,0x0c,0xc0,0xf2]
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

; CHECK: vmulls_4xi16
define <4 x i32> @vmulls_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
; CHECK: vmull.s16 q8, d16, d17 @ encoding: [0xa1,0x0c,0xd0,0xf2]
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

; CHECK: vmulls_2xi32
define <2 x i64> @vmulls_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
; CHECK: vmull.s32 q8, d16, d17 @ encoding: [0xa1,0x0c,0xe0,0xf2]
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}
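
; Zero-extending both operands instead of sign-extending selects the
; unsigned forms vmull.u8/u16/u32.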

; CHECK: vmullu_8xi8
define <8 x i16> @vmullu_8xi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
; CHECK: vmull.u8 q8, d16, d17 @ encoding: [0xa1,0x0c,0xc0,0xf3]
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

; CHECK: vmullu_4xi16
define <4 x i32> @vmullu_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
; CHECK: vmull.u16 q8, d16, d17 @ encoding: [0xa1,0x0c,0xd0,0xf3]
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

; CHECK: vmullu_2xi32
define <2 x i64> @vmullu_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
; CHECK: vmull.u32 q8, d16, d17 @ encoding: [0xa1,0x0c,0xe0,0xf3]
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}
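
; vmull.p8 is the widening polynomial multiply: full 16-bit carry-less
; products of the 8-bit elements, again reached via an intrinsic.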

declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>) nounwind readnone

; CHECK: vmullp_8xi8
define <8 x i16> @vmullp_8xi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
; CHECK: vmull.p8 q8, d16, d17 @ encoding: [0xa1,0x0e,0xc0,0xf2]
  %tmp3 = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i16> %tmp3
}
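
; vqdmull is a saturating doubling multiply long: the double-width
; product is doubled and saturated, so for s16 elements
; result = sat32(2 * a * b); 0x8000 * 0x8000 again saturates
; (to 0x7fffffff).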

declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

; CHECK: vqdmull_4xi16
define <4 x i32> @vqdmull_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
; CHECK: vqdmull.s16 q8, d16, d17 @ encoding: [0xa1,0x0d,0xd0,0xf2]
  %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp3
}

; CHECK: vqdmull_2xi32
define <2 x i64> @vqdmull_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
; CHECK: vqdmull.s32 q8, d16, d17 @ encoding: [0xa1,0x0d,0xe0,0xf2]
  %tmp3 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp3
}