1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=BTVER1
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=BTVER2
8 ; A length of zero is equivalent to a bit length of 64.
9 define <2 x i64> @extrqi_len0_idx0(<2 x i64> %a) {
10 ; ALL-LABEL: extrqi_len0_idx0:
12 ; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
14 %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %a, i8 0, i8 0)
18 define <2 x i64> @extrqi_len8_idx16(<2 x i64> %a) {
19 ; ALL-LABEL: extrqi_len8_idx16:
21 ; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
23 %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %a, i8 8, i8 16)
27 ; If the length + index exceeds the bottom 64 bits the result is undefined.
28 define <2 x i64> @extrqi_len32_idx48(<2 x i64> %a) {
29 ; ALL-LABEL: extrqi_len32_idx48:
31 ; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
33 %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %a, i8 32, i8 48)
37 define <16 x i8> @shuf_0zzzuuuuuuuuuuuu(<16 x i8> %a0) {
38 ; BTVER1-LABEL: shuf_0zzzuuuuuuuuuuuu:
40 ; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
43 ; BTVER2-LABEL: shuf_0zzzuuuuuuuuuuuu:
45 ; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
47 %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
51 define <16 x i8> @shuf_0zzzzzzz1zzzzzzz(<16 x i8> %a0) {
52 ; BTVER1-LABEL: shuf_0zzzzzzz1zzzzzzz:
54 ; BTVER1-NEXT: movaps %xmm0, %xmm1
55 ; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
56 ; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
57 ; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
60 ; BTVER2-LABEL: shuf_0zzzzzzz1zzzzzzz:
62 ; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
64 %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
68 define <16 x i8> @shuf_2zzzzzzz3zzzzzzz(<16 x i8> %a0) {
69 ; BTVER1-LABEL: shuf_2zzzzzzz3zzzzzzz:
71 ; BTVER1-NEXT: movaps %xmm0, %xmm1
72 ; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[3],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
73 ; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
74 ; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
77 ; BTVER2-LABEL: shuf_2zzzzzzz3zzzzzzz:
79 ; BTVER2-NEXT: vpsrld $16, %xmm0, %xmm0
80 ; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
82 %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 2, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 3, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
86 define <16 x i8> @shuf_01zzuuuuuuuuuuuu(<16 x i8> %a0) {
87 ; BTVER1-LABEL: shuf_01zzuuuuuuuuuuuu:
89 ; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
92 ; BTVER2-LABEL: shuf_01zzuuuuuuuuuuuu:
94 ; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
96 %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
100 define <16 x i8> @shuf_01zzzzzz23zzzzzz(<16 x i8> %a0) {
101 ; BTVER1-LABEL: shuf_01zzzzzz23zzzzzz:
103 ; BTVER1-NEXT: movaps %xmm0, %xmm1
104 ; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
105 ; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
106 ; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
109 ; BTVER2-LABEL: shuf_01zzzzzz23zzzzzz:
111 ; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
113 %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 2, i32 3, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
117 define <16 x i8> @shuf_1zzzuuuuuuuuuuuu(<16 x i8> %a0) {
118 ; ALL-LABEL: shuf_1zzzuuuuuuuuuuuu:
120 ; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
122 %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
126 define <8 x i16> @shuf_1zzzuuuu(<8 x i16> %a0) {
127 ; ALL-LABEL: shuf_1zzzuuuu:
129 ; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
131 %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
135 define <8 x i16> @shuf_12zzuuuu(<8 x i16> %a0) {
136 ; ALL-LABEL: shuf_12zzuuuu:
138 ; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3,4,5],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
140 %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
144 define <8 x i16> @shuf_012zuuuu(<8 x i16> %a0) {
145 ; ALL-LABEL: shuf_012zuuuu:
147 ; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
149 %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
153 define <8 x i16> @shuf_0zzz1zzz(<8 x i16> %a0) {
154 ; BTVER1-LABEL: shuf_0zzz1zzz:
156 ; BTVER1-NEXT: movaps %xmm0, %xmm1
157 ; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
158 ; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
159 ; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
162 ; BTVER2-LABEL: shuf_0zzz1zzz:
164 ; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
166 %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8>
170 define <4 x i32> @shuf_0z1z(<4 x i32> %a0) {
171 ; BTVER1-LABEL: shuf_0z1z:
173 ; BTVER1-NEXT: pxor %xmm1, %xmm1
174 ; BTVER1-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
177 ; BTVER2-LABEL: shuf_0z1z:
179 ; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
181 %s = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
189 ; A length of zero is equivalent to a bit length of 64.
190 define <2 x i64> @insertqi_len0_idx0(<2 x i64> %a, <2 x i64> %b) {
191 ; ALL-LABEL: insertqi_len0_idx0:
193 ; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6,7],xmm0[u,u,u,u,u,u,u,u]
195 %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a, <2 x i64> %b, i8 0, i8 0)
199 define <2 x i64> @insertqi_len8_idx16(<2 x i64> %a, <2 x i64> %b) {
200 ; ALL-LABEL: insertqi_len8_idx16:
202 ; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3,4,5,6,7,u,u,u,u,u,u,u,u]
204 %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a, <2 x i64> %b, i8 8, i8 16)
208 ; If the length + index exceeds the bottom 64 bits the result is undefined
209 define <2 x i64> @insertqi_len32_idx48(<2 x i64> %a, <2 x i64> %b) {
210 ; ALL-LABEL: insertqi_len32_idx48:
212 ; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
214 %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a, <2 x i64> %b, i8 32, i8 48)
218 define <16 x i8> @shuf_0_0_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
219 ; ALL-LABEL: shuf_0_0_2_3_uuuu_uuuu_uuuu:
221 ; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
223 %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 0, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
227 define <16 x i8> @shuf_0_16_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
228 ; ALL-LABEL: shuf_0_16_2_3_uuuu_uuuu_uuuu:
230 ; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3,4,5,6,7,u,u,u,u,u,u,u,u]
232 %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
236 define <16 x i8> @shuf_16_1_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
237 ; ALL-LABEL: shuf_16_1_2_3_uuuu_uuuu_uuuu:
239 ; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
241 %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
245 define <8 x i16> @shuf_0823uuuu(<8 x i16> %a0, <8 x i16> %a1) {
246 ; ALL-LABEL: shuf_0823uuuu:
248 ; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1],xmm0[4,5,6,7,u,u,u,u,u,u,u,u]
250 %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
254 define <8 x i16> @shuf_0183uuuu(<8 x i16> %a0, <8 x i16> %a1) {
255 ; ALL-LABEL: shuf_0183uuuu:
257 ; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[0,1],xmm0[6,7,u,u,u,u,u,u,u,u]
259 %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
263 define <8 x i16> @shuf_0128uuuu(<8 x i16> %a0, <8 x i16> %a1) {
264 ; ALL-LABEL: shuf_0128uuuu:
266 ; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[0,1],xmm0[u,u,u,u,u,u,u,u]
268 %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
272 define <8 x i16> @shuf_0893uuuu(<8 x i16> %a0, <8 x i16> %a1) {
273 ; ALL-LABEL: shuf_0893uuuu:
275 ; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3],xmm0[6,7,u,u,u,u,u,u,u,u]
277 %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
281 define <8 x i16> @shuf_089Auuuu(<8 x i16> %a0, <8 x i16> %a1) {
282 ; ALL-LABEL: shuf_089Auuuu:
284 ; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3,4,5],xmm0[u,u,u,u,u,u,u,u]
286 %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
290 define <8 x i16> @shuf_089uuuuu(<8 x i16> %a0, <8 x i16> %a1) {
291 ; ALL-LABEL: shuf_089uuuuu:
293 ; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3],xmm0[6,7,u,u,u,u,u,u,u,u]
295 %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
304 define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) {
305 ; BTVER1-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
307 ; BTVER1-NEXT: psrld $16, %xmm1
308 ; BTVER1-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
309 ; BTVER1-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
312 ; BTVER2-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
314 ; BTVER2-NEXT: vpsrld $16, %xmm1, %xmm1
315 ; BTVER2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
316 ; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
318 %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
322 declare <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64>, i8, i8) nounwind
323 declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind