1 ; RUN: llc -mtriple=arm-eabi -mattr=+neon -lower-interleaved-accesses=true < %s | FileCheck %s
; Factor-2 interleaved load of i8 elements: one <16 x i8> load is split by two
; shufflevectors into its even lanes (0, 2, ..., 14) and its odd lanes
; (1, 3, ..., 15), and the two halves are added. The expected output is a
; single vld2.8 into d16/d17.
3 ; CHECK-LABEL: load_factor2:
4 ; CHECK: vld2.8 {d16, d17}, [r0]
5 define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
6 %wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
7 %strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
8 %strided.v1 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
9 %add = add nsw <8 x i8> %strided.v0, %strided.v1
; Factor-3 interleaved load of i32 elements: a <6 x i32> load is de-interleaved
; with stride 3. Only the lane groups {2, 5} and {1, 4} are extracted and added;
; the group starting at lane 0 is not used. The expected output is a single
; vld3.32 into d16-d18.
13 ; CHECK-LABEL: load_factor3:
14 ; CHECK: vld3.32 {d16, d17, d18}, [r0]
15 define <2 x i32> @load_factor3(i32* %ptr) {
16 %base = bitcast i32* %ptr to <6 x i32>*
17 %wide.vec = load <6 x i32>, <6 x i32>* %base, align 4
18 %strided.v2 = shufflevector <6 x i32> %wide.vec, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
19 %strided.v1 = shufflevector <6 x i32> %wide.vec, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
20 %add = add nsw <2 x i32> %strided.v2, %strided.v1
; Factor-4 interleaved load of i32 elements: a <16 x i32> load is de-interleaved
; with stride 4. Only the lane groups {0, 4, 8, 12} and {2, 6, 10, 14} are
; extracted and added. The expected output is a pair of vld4.32 instructions,
; the first using the writeback form "[r0]!" and the second plain "[r0]".
24 ; CHECK-LABEL: load_factor4:
25 ; CHECK: vld4.32 {d16, d18, d20, d22}, [r0]!
26 ; CHECK: vld4.32 {d17, d19, d21, d23}, [r0]
27 define <4 x i32> @load_factor4(i32* %ptr) {
28 %base = bitcast i32* %ptr to <16 x i32>*
29 %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
30 %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
31 %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
32 %add = add nsw <4 x i32> %strided.v0, %strided.v2
; Factor-2 interleaved store of i8 elements: the shuffle mask alternates lanes
; of %v0 (indices 0-7) and %v1 (indices 8-15) as 0, 8, 1, 9, ..., 7, 15, and
; the resulting <16 x i8> vector is stored to %ptr. The expected output is a
; single vst2.8 from d16/d17.
36 ; CHECK-LABEL: store_factor2:
37 ; CHECK: vst2.8 {d16, d17}, [r0]
38 define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
39 %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
40 store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
; Factor-3 interleaved store of i32 elements. %v0 and %v1 are concatenated into
; one <8 x i32> operand, %v2 is widened to <8 x i32> with undef upper lanes, and
; the final mask (0, 4, 8, 1, 5, 9, ...) takes one lane from each of %v0, %v1
; and %v2 in turn before the <12 x i32> store. The expected output is a pair of
; vst3.32 instructions, the first using the writeback form "[r0]!".
44 ; CHECK-LABEL: store_factor3:
45 ; CHECK: vst3.32 {d16, d18, d20}, [r0]!
46 ; CHECK: vst3.32 {d17, d19, d21}, [r0]
47 define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
48 %base = bitcast i32* %ptr to <12 x i32>*
49 %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
50 %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
51 %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
52 store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
; Factor-4 interleaved store of i32 elements. %v0/%v1 and %v2/%v3 are each
; concatenated into an <8 x i32> operand, and the final mask
; (0, 4, 8, 12, 1, 5, 9, 13, ...) takes one lane from each of the four inputs in
; turn before the <16 x i32> store. The expected output is a pair of vst4.32
; instructions, the first using the writeback form "[r0]!".
56 ; CHECK-LABEL: store_factor4:
57 ; CHECK: vst4.32 {d16, d18, d20, d22}, [r0]!
58 ; CHECK: vst4.32 {d17, d19, d21, d23}, [r0]
59 define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
60 %base = bitcast i32* %ptr to <16 x i32>*
61 %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
62 %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
63 %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
64 store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
68 ; The following cases test that interleaved accesses of pointer vectors can be
69 ; matched to vldN/vstN instructions.
; Factor-2 interleaved load whose elements are pointers (i32*): a <4 x i32*>
; load from which only the even lanes {0, 2} are extracted and returned. The
; expected output is a vld2.32, i.e. the pointer elements are handled as
; 32-bit lanes.
71 ; CHECK-LABEL: load_ptrvec_factor2:
72 ; CHECK: vld2.32 {d16, d17}, [r0]
73 define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) {
74 %base = bitcast i32** %ptr to <4 x i32*>*
75 %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4
76 %strided.v0 = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
77 ret <2 x i32*> %strided.v0
; Factor-3 interleaved load whose elements are pointers (i32*): a <6 x i32*>
; load de-interleaved with stride 3. The lane group {2, 5} is stored to %ptr1
; and the lane group {1, 4} to %ptr2; the group starting at lane 0 is not used.
; The expected output is a single vld3.32 into d16-d18.
80 ; CHECK-LABEL: load_ptrvec_factor3:
81 ; CHECK: vld3.32 {d16, d17, d18}, [r0]
82 define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
83 %base = bitcast i32** %ptr to <6 x i32*>*
84 %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4
85 %strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
86 store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1
87 %strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
88 store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2
; Factor-4 interleaved load whose elements are pointers (i32*): an <8 x i32*>
; load de-interleaved with stride 4. The lane group {1, 5} is stored to %ptr1
; and the lane group {3, 7} to %ptr2; the groups starting at lanes 0 and 2 are
; not used. The expected output is a single vld4.32 into d16-d19.
92 ; CHECK-LABEL: load_ptrvec_factor4:
93 ; CHECK: vld4.32 {d16, d17, d18, d19}, [r0]
94 define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
95 %base = bitcast i32** %ptr to <8 x i32*>*
96 %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4
97 %strided.v1 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
98 %strided.v3 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
99 store <2 x i32*> %strided.v1, <2 x i32*>* %ptr1
100 store <2 x i32*> %strided.v3, <2 x i32*>* %ptr2
; Factor-2 interleaved store whose elements are pointers (i32*): the mask
; (0, 2, 1, 3) alternates lanes of %v0 and %v1, and the resulting <4 x i32*>
; vector is stored through %base. The expected output is a single vst2.32
; from d16/d17.
104 ; CHECK-LABEL: store_ptrvec_factor2:
105 ; CHECK: vst2.32 {d16, d17}, [r0]
106 define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
107 %base = bitcast i32** %ptr to <4 x i32*>*
108 %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
109 store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4
; Factor-3 interleaved store whose elements are pointers (i32*). %v0 and %v1 are
; concatenated into one <4 x i32*> operand, %v2 is widened to <4 x i32*> with
; undef upper lanes, and the final mask (0, 2, 4, 1, 3, 5) takes one lane from
; each of %v0, %v1 and %v2 in turn before the <6 x i32*> store. The expected
; output is a single vst3.32 from d16-d18.
113 ; CHECK-LABEL: store_ptrvec_factor3:
114 ; CHECK: vst3.32 {d16, d17, d18}, [r0]
115 define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
116 %base = bitcast i32** %ptr to <6 x i32*>*
117 %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
118 %v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
119 %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
120 store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4
; Factor-4 interleaved store whose elements are pointers (i32*). %v0/%v1 and
; %v2/%v3 are each concatenated into a <4 x i32*> operand, and the final mask
; (0, 2, 4, 6, 1, 3, 5, 7) takes one lane from each of the four inputs in turn
; before the <8 x i32*> store. The expected output is a single vst4.32 from
; d16-d19.
; NOTE(review): %ptr is declared i32* here, while the other pointer-vector
; tests in this file declare it i32**; the bitcast to <8 x i32*>* is applied
; either way. Confirm whether the difference is intentional.
124 ; CHECK-LABEL: store_ptrvec_factor4:
125 ; CHECK: vst4.32 {d16, d17, d18, d19}, [r0]
126 define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
127 %base = bitcast i32* %ptr to <8 x i32*>*
128 %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
129 %v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
130 %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
131 store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4
135 ; The following cases check that shuffle masks with undef indices can be matched
136 ; into vldN/vstN instructions.
; Factor-2 interleaved load with partially undef masks: an <8 x i32> load whose
; two extraction masks are <undef, 2, undef, 6> and <undef, 3, undef, 7>, so
; only the defined indices fix the stride-2 pattern. The two extracted vectors
; are added. The expected output is a single vld2.32 into d16-d19.
138 ; CHECK-LABEL: load_undef_mask_factor2:
139 ; CHECK: vld2.32 {d16, d17, d18, d19}, [r0]
140 define <4 x i32> @load_undef_mask_factor2(i32* %ptr) {
141 %base = bitcast i32* %ptr to <8 x i32>*
142 %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4
143 %strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
144 %strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
145 %add = add nsw <4 x i32> %strided.v0, %strided.v1
; Factor-3 interleaved load with a partially undef mask: a <12 x i32> load where
; one extraction mask is <2, undef, undef, undef> (only its first index is
; defined) and the other is the fully defined <1, 4, 7, 10>. The two extracted
; vectors are added. The expected output is a pair of vld3.32 instructions, the
; first using the writeback form "[r0]!".
149 ; CHECK-LABEL: load_undef_mask_factor3:
150 ; CHECK: vld3.32 {d16, d18, d20}, [r0]!
151 ; CHECK: vld3.32 {d17, d19, d21}, [r0]
152 define <4 x i32> @load_undef_mask_factor3(i32* %ptr) {
153 %base = bitcast i32* %ptr to <12 x i32>*
154 %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
155 %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
156 %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
157 %add = add nsw <4 x i32> %strided.v2, %strided.v1
; Factor-4 interleaved load with partially undef masks: a <16 x i32> load whose
; two extraction masks are <0, 4, undef, undef> and <2, 6, undef, undef>, so
; only the first two indices of each fix the stride-4 pattern. The two
; extracted vectors are added. The expected output is a pair of vld4.32
; instructions, the first using the writeback form "[r0]!".
161 ; CHECK-LABEL: load_undef_mask_factor4:
162 ; CHECK: vld4.32 {d16, d18, d20, d22}, [r0]!
163 ; CHECK: vld4.32 {d17, d19, d21, d23}, [r0]
164 define <4 x i32> @load_undef_mask_factor4(i32* %ptr) {
165 %base = bitcast i32* %ptr to <16 x i32>*
166 %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
167 %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
168 %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
169 %add = add nsw <4 x i32> %strided.v0, %strided.v2
; Factor-2 interleaved store with a partially undef mask: the first four mask
; elements are undef and the remaining four (2, 6, 3, 7) alternate lanes of %v0
; and %v1. The resulting <8 x i32> vector is stored through %base. The expected
; output is a single vst2.32 from d16-d19.
173 ; CHECK-LABEL: store_undef_mask_factor2:
174 ; CHECK: vst2.32 {d16, d17, d18, d19}, [r0]
175 define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
176 %base = bitcast i32* %ptr to <8 x i32>*
177 %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
178 store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4
; Factor-3 interleaved store with a partially undef mask. The operands are built
; as in the fully defined factor-3 store (%v0/%v1 concatenated, %v2 padded with
; undef), but the final interleave mask has undef at positions 2 and 4:
; <0, 4, undef, 1, undef, 9, 2, 6, 10, 3, 7, 11>. The expected output is a pair
; of vst3.32 instructions, the first using the writeback form "[r0]!".
182 ; CHECK-LABEL: store_undef_mask_factor3:
183 ; CHECK: vst3.32 {d16, d18, d20}, [r0]!
184 ; CHECK: vst3.32 {d17, d19, d21}, [r0]
185 define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
186 %base = bitcast i32* %ptr to <12 x i32>*
187 %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
188 %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
189 %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
190 store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
; Factor-4 interleaved store with a partially undef mask. The operands are built
; as in the fully defined factor-4 store (%v0/%v1 and %v2/%v3 concatenated), but
; the final interleave mask has undef at positions 3 and 4:
; <0, 4, 8, undef, undef, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15>. The expected
; output is a pair of vst4.32 instructions, the first using the writeback form
; "[r0]!".
194 ; CHECK-LABEL: store_undef_mask_factor4:
195 ; CHECK: vst4.32 {d16, d18, d20, d22}, [r0]!
196 ; CHECK: vst4.32 {d17, d19, d21, d23}, [r0]
197 define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
198 %base = bitcast i32* %ptr to <16 x i32>*
199 %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
200 %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
201 %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
202 store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4