; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512BW

define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: avg_v4i8:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, (%rax)
; SSE2-NEXT: retq
; AVX2-LABEL: avg_v4i8:
; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovd %xmm0, (%rax)
; AVX2-NEXT: retq
; AVX512BW-LABEL: avg_v4i8:
; AVX512BW-NEXT: vmovd (%rdi), %xmm0
; AVX512BW-NEXT: vmovd (%rsi), %xmm1
; AVX512BW-NEXT: vpavgb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <4 x i8>, <4 x i8>* %a
%2 = load <4 x i8>, <4 x i8>* %b
%3 = zext <4 x i8> %1 to <4 x i32>
%4 = zext <4 x i8> %2 to <4 x i32>
%5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
%6 = add nuw nsw <4 x i32> %5, %4
%7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
%8 = trunc <4 x i32> %7 to <4 x i8>
store <4 x i8> %8, <4 x i8>* undef, align 4
ret void
}

define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) {
; SSE2-LABEL: avg_v8i8:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; SSE2-NEXT: retq
; AVX2-LABEL: avg_v8i8:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: retq
; AVX512BW-LABEL: avg_v8i8:
; AVX512BW-NEXT: vmovq (%rdi), %xmm0
; AVX512BW-NEXT: vmovq (%rsi), %xmm1
; AVX512BW-NEXT: vpavgb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <8 x i8>, <8 x i8>* %a
%2 = load <8 x i8>, <8 x i8>* %b
%3 = zext <8 x i8> %1 to <8 x i32>
%4 = zext <8 x i8> %2 to <8 x i32>
%5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%6 = add nuw nsw <8 x i32> %5, %4
%7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%8 = trunc <8 x i32> %7 to <8 x i8>
store <8 x i8> %8, <8 x i8>* undef, align 4
ret void
}

define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) {
; SSE2-LABEL: avg_v16i8:
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: pavgb (%rdi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
; AVX-LABEL: avg_v16i8:
; AVX-NEXT: vmovdqa (%rsi), %xmm0
; AVX-NEXT: vpavgb (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a
%2 = load <16 x i8>, <16 x i8>* %b
%3 = zext <16 x i8> %1 to <16 x i32>
%4 = zext <16 x i8> %2 to <16 x i32>
%5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%6 = add nuw nsw <16 x i32> %5, %4
%7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%8 = trunc <16 x i32> %7 to <16 x i8>
store <16 x i8> %8, <16 x i8>* undef, align 4
ret void
}

define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
; AVX2-LABEL: avg_v32i8:
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512BW-LABEL: avg_v32i8:
; AVX512BW-NEXT: vmovdqa (%rsi), %ymm0
; AVX512BW-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <32 x i8>, <32 x i8>* %a
%2 = load <32 x i8>, <32 x i8>* %b
%3 = zext <32 x i8> %1 to <32 x i32>
%4 = zext <32 x i8> %2 to <32 x i32>
%5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%6 = add nuw nsw <32 x i32> %5, %4
%7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%8 = trunc <32 x i32> %7 to <32 x i8>
store <32 x i8> %8, <32 x i8>* undef, align 4
ret void
}

define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; AVX512BW-LABEL: avg_v64i8:
; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <64 x i8>, <64 x i8>* %a
%2 = load <64 x i8>, <64 x i8>* %b
%3 = zext <64 x i8> %1 to <64 x i32>
%4 = zext <64 x i8> %2 to <64 x i32>
%5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%6 = add nuw nsw <64 x i32> %5, %4
%7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%8 = trunc <64 x i32> %7 to <64 x i8>
store <64 x i8> %8, <64 x i8>* undef, align 4
ret void
}

define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) {
; SSE2-LABEL: avg_v4i16:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgw %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; SSE2-NEXT: retq
; AVX2-LABEL: avg_v4i16:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpavgw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: retq
; AVX512BW-LABEL: avg_v4i16:
; AVX512BW-NEXT: vmovq (%rdi), %xmm0
; AVX512BW-NEXT: vmovq (%rsi), %xmm1
; AVX512BW-NEXT: vpavgw %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <4 x i16>, <4 x i16>* %a
%2 = load <4 x i16>, <4 x i16>* %b
%3 = zext <4 x i16> %1 to <4 x i32>
%4 = zext <4 x i16> %2 to <4 x i32>
%5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
%6 = add nuw nsw <4 x i32> %5, %4
%7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
%8 = trunc <4 x i32> %7 to <4 x i16>
store <4 x i16> %8, <4 x i16>* undef, align 4
ret void
}

define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
; SSE2-LABEL: avg_v8i16:
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: pavgw (%rdi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
; AVX-LABEL: avg_v8i16:
; AVX-NEXT: vmovdqa (%rsi), %xmm0
; AVX-NEXT: vpavgw (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a
%2 = load <8 x i16>, <8 x i16>* %b
%3 = zext <8 x i16> %1 to <8 x i32>
%4 = zext <8 x i16> %2 to <8 x i32>
%5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%6 = add nuw nsw <8 x i32> %5, %4
%7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%8 = trunc <8 x i32> %7 to <8 x i16>
store <8 x i16> %8, <8 x i16>* undef, align 4
ret void
}

define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
; AVX2-LABEL: avg_v16i16:
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512BW-LABEL: avg_v16i16:
; AVX512BW-NEXT: vmovdqa (%rsi), %ymm0
; AVX512BW-NEXT: vpavgw (%rdi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <16 x i16>, <16 x i16>* %a
%2 = load <16 x i16>, <16 x i16>* %b
%3 = zext <16 x i16> %1 to <16 x i32>
%4 = zext <16 x i16> %2 to <16 x i32>
%5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%6 = add nuw nsw <16 x i32> %5, %4
%7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%8 = trunc <16 x i32> %7 to <16 x i16>
store <16 x i16> %8, <16 x i16>* undef, align 4
ret void
}

define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512BW-LABEL: avg_v32i16:
; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <32 x i16>, <32 x i16>* %a
%2 = load <32 x i16>, <32 x i16>* %b
%3 = zext <32 x i16> %1 to <32 x i32>
%4 = zext <32 x i16> %2 to <32 x i32>
%5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%6 = add nuw nsw <32 x i32> %5, %4
%7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%8 = trunc <32 x i32> %7 to <32 x i16>
store <32 x i16> %8, <32 x i16>* undef, align 4
ret void
}

define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: avg_v4i8_2:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, (%rax)
; SSE2-NEXT: retq
; AVX2-LABEL: avg_v4i8_2:
; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, (%rax)
; AVX2-NEXT: retq
; AVX512BW-LABEL: avg_v4i8_2:
; AVX512BW-NEXT: vmovd (%rdi), %xmm0
; AVX512BW-NEXT: vmovd (%rsi), %xmm1
; AVX512BW-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <4 x i8>, <4 x i8>* %a
%2 = load <4 x i8>, <4 x i8>* %b
%3 = zext <4 x i8> %1 to <4 x i32>
%4 = zext <4 x i8> %2 to <4 x i32>
%5 = add nuw nsw <4 x i32> %3, %4
%6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
%7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
%8 = trunc <4 x i32> %7 to <4 x i8>
store <4 x i8> %8, <4 x i8>* undef, align 4
ret void
}

define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) {
; SSE2-LABEL: avg_v8i8_2:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; SSE2-NEXT: retq
; AVX2-LABEL: avg_v8i8_2:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: retq
; AVX512BW-LABEL: avg_v8i8_2:
; AVX512BW-NEXT: vmovq (%rdi), %xmm0
; AVX512BW-NEXT: vmovq (%rsi), %xmm1
; AVX512BW-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <8 x i8>, <8 x i8>* %a
%2 = load <8 x i8>, <8 x i8>* %b
%3 = zext <8 x i8> %1 to <8 x i32>
%4 = zext <8 x i8> %2 to <8 x i32>
%5 = add nuw nsw <8 x i32> %3, %4
%6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%8 = trunc <8 x i32> %7 to <8 x i8>
store <8 x i8> %8, <8 x i8>* undef, align 4
ret void
}

define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
; SSE2-LABEL: avg_v16i8_2:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgb (%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
; AVX-LABEL: avg_v16i8_2:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a
%2 = load <16 x i8>, <16 x i8>* %b
%3 = zext <16 x i8> %1 to <16 x i32>
%4 = zext <16 x i8> %2 to <16 x i32>
%5 = add nuw nsw <16 x i32> %3, %4
%6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%8 = trunc <16 x i32> %7 to <16 x i8>
store <16 x i8> %8, <16 x i8>* undef, align 4
ret void
}

define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
; AVX2-LABEL: avg_v32i8_2:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512BW-LABEL: avg_v32i8_2:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <32 x i8>, <32 x i8>* %a
%2 = load <32 x i8>, <32 x i8>* %b
%3 = zext <32 x i8> %1 to <32 x i32>
%4 = zext <32 x i8> %2 to <32 x i32>
%5 = add nuw nsw <32 x i32> %3, %4
%6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%8 = trunc <32 x i32> %7 to <32 x i8>
store <32 x i8> %8, <32 x i8>* undef, align 4
ret void
}

define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
; AVX512BW-LABEL: avg_v64i8_2:
; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <64 x i8>, <64 x i8>* %a
%2 = load <64 x i8>, <64 x i8>* %b
%3 = zext <64 x i8> %1 to <64 x i32>
%4 = zext <64 x i8> %2 to <64 x i32>
%5 = add nuw nsw <64 x i32> %4, %4
%6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%8 = trunc <64 x i32> %7 to <64 x i8>
store <64 x i8> %8, <64 x i8>* undef, align 4
ret void
}

define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) {
; SSE2-LABEL: avg_v4i16_2:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgw %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; SSE2-NEXT: retq
; AVX2-LABEL: avg_v4i16_2:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: retq
; AVX512BW-LABEL: avg_v4i16_2:
; AVX512BW-NEXT: vmovq (%rdi), %xmm0
; AVX512BW-NEXT: vmovq (%rsi), %xmm1
; AVX512BW-NEXT: vpavgw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <4 x i16>, <4 x i16>* %a
%2 = load <4 x i16>, <4 x i16>* %b
%3 = zext <4 x i16> %1 to <4 x i32>
%4 = zext <4 x i16> %2 to <4 x i32>
%5 = add nuw nsw <4 x i32> %3, %4
%6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
%7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
%8 = trunc <4 x i32> %7 to <4 x i16>
store <4 x i16> %8, <4 x i16>* undef, align 4
ret void
}

define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) {
; SSE2-LABEL: avg_v8i16_2:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgw (%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
; AVX-LABEL: avg_v8i16_2:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a
%2 = load <8 x i16>, <8 x i16>* %b
%3 = zext <8 x i16> %1 to <8 x i32>
%4 = zext <8 x i16> %2 to <8 x i32>
%5 = add nuw nsw <8 x i32> %3, %4
%6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%8 = trunc <8 x i32> %7 to <8 x i16>
store <8 x i16> %8, <8 x i16>* undef, align 4
ret void
}

define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
; AVX2-LABEL: avg_v16i16_2:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512BW-LABEL: avg_v16i16_2:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <16 x i16>, <16 x i16>* %a
%2 = load <16 x i16>, <16 x i16>* %b
%3 = zext <16 x i16> %1 to <16 x i32>
%4 = zext <16 x i16> %2 to <16 x i32>
%5 = add nuw nsw <16 x i32> %3, %4
%6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%8 = trunc <16 x i32> %7 to <16 x i16>
store <16 x i16> %8, <16 x i16>* undef, align 4
ret void
}

define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512BW-LABEL: avg_v32i16_2:
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <32 x i16>, <32 x i16>* %a
%2 = load <32 x i16>, <32 x i16>* %b
%3 = zext <32 x i16> %1 to <32 x i32>
%4 = zext <32 x i16> %2 to <32 x i32>
%5 = add nuw nsw <32 x i32> %3, %4
%6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%8 = trunc <32 x i32> %7 to <32 x i16>
store <32 x i16> %8, <32 x i16>* undef, align 4
ret void
}

define void @avg_v4i8_const(<4 x i8>* %a) {
; SSE2-LABEL: avg_v4i8_const:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT: movd %xmm0, (%rax)
; SSE2-NEXT: retq
; AVX2-LABEL: avg_v4i8_const:
; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, (%rax)
; AVX2-NEXT: retq
; AVX512BW-LABEL: avg_v4i8_const:
; AVX512BW-NEXT: vmovd (%rdi), %xmm0
; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <4 x i8>, <4 x i8>* %a
%2 = zext <4 x i8> %1 to <4 x i32>
%3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
%4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
%5 = trunc <4 x i32> %4 to <4 x i8>
store <4 x i8> %5, <4 x i8>* undef, align 4
ret void
}

define void @avg_v8i8_const(<8 x i8>* %a) {
; SSE2-LABEL: avg_v8i8_const:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT: movq %xmm0, (%rax)
; SSE2-NEXT: retq
; AVX2-LABEL: avg_v8i8_const:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: retq
; AVX512BW-LABEL: avg_v8i8_const:
; AVX512BW-NEXT: vmovq (%rdi), %xmm0
; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <8 x i8>, <8 x i8>* %a
%2 = zext <8 x i8> %1 to <8 x i32>
%3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
%4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%5 = trunc <8 x i32> %4 to <8 x i8>
store <8 x i8> %5, <8 x i8>* undef, align 4
ret void
}

define void @avg_v16i8_const(<16 x i8>* %a) {
; SSE2-LABEL: avg_v16i8_const:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
; AVX-LABEL: avg_v16i8_const:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a
%2 = zext <16 x i8> %1 to <16 x i32>
%3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
%4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%5 = trunc <16 x i32> %4 to <16 x i8>
store <16 x i8> %5, <16 x i8>* undef, align 4
ret void
}

define void @avg_v32i8_const(<32 x i8>* %a) {
; AVX2-LABEL: avg_v32i8_const:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512BW-LABEL: avg_v32i8_const:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <32 x i8>, <32 x i8>* %a
%2 = zext <32 x i8> %1 to <32 x i32>
%3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
%4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%5 = trunc <32 x i32> %4 to <32 x i8>
store <32 x i8> %5, <32 x i8>* undef, align 4
ret void
}

define void @avg_v64i8_const(<64 x i8>* %a) {
; AVX512BW-LABEL: avg_v64i8_const:
; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <64 x i8>, <64 x i8>* %a
%2 = zext <64 x i8> %1 to <64 x i32>
%3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
%4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%5 = trunc <64 x i32> %4 to <64 x i8>
store <64 x i8> %5, <64 x i8>* undef, align 4
ret void
}

define void @avg_v4i16_const(<4 x i16>* %a) {
; SSE2-LABEL: avg_v4i16_const:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0
; SSE2-NEXT: movq %xmm0, (%rax)
; SSE2-NEXT: retq
; AVX2-LABEL: avg_v4i16_const:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: retq
; AVX512BW-LABEL: avg_v4i16_const:
; AVX512BW-NEXT: vmovq (%rdi), %xmm0
; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <4 x i16>, <4 x i16>* %a
%2 = zext <4 x i16> %1 to <4 x i32>
%3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
%4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
%5 = trunc <4 x i32> %4 to <4 x i16>
store <4 x i16> %5, <4 x i16>* undef, align 4
ret void
}

define void @avg_v8i16_const(<8 x i16>* %a) {
; SSE2-LABEL: avg_v8i16_const:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
; AVX-LABEL: avg_v8i16_const:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a
%2 = zext <8 x i16> %1 to <8 x i32>
%3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
%4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%5 = trunc <8 x i32> %4 to <8 x i16>
store <8 x i16> %5, <8 x i16>* undef, align 4
ret void
}

define void @avg_v16i16_const(<16 x i16>* %a) {
; AVX2-LABEL: avg_v16i16_const:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512BW-LABEL: avg_v16i16_const:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <16 x i16>, <16 x i16>* %a
%2 = zext <16 x i16> %1 to <16 x i32>
%3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
%4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%5 = trunc <16 x i32> %4 to <16 x i16>
store <16 x i16> %5, <16 x i16>* undef, align 4
ret void
}

define void @avg_v32i16_const(<32 x i16>* %a) {
; AVX512BW-LABEL: avg_v32i16_const:
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
; AVX512BW-NEXT: retq
%1 = load <32 x i16>, <32 x i16>* %a
%2 = zext <32 x i16> %1 to <32 x i32>
%3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
%4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%5 = trunc <32 x i32> %4 to <32 x i16>
store <32 x i16> %5, <32 x i16>* undef, align 4
ret void
}