1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 --check-prefix=CHECK
4 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL --check-prefix=CHECK
6 ; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
7 ; because that is slower than two 16-byte loads.
8 ; Other AVX-capable chips don't have that problem.
10 define <8 x float> @load32bytes(<8 x float>* %Ap) {
11 ; CHECK-LABEL: load32bytes
23 %A = load <8 x float>* %Ap, align 16
27 ; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store
28 ; because that is slower than two 16-byte stores.
29 ; Other AVX-capable chips don't have that problem.
31 define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
32 ; CHECK-LABEL: store32bytes
34 ; SANDYB: vextractf128
44 store <8 x float> %A, <8 x float>* %P, align 16
48 ; Merge two consecutive 16-byte subvector loads into a single 32-byte load
51 declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)
53 ; Use the vinsertf128 intrinsic to model source code
54 ; that explicitly uses AVX intrinsics.
55 define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) {
56 ; CHECK-LABEL: combine_16_byte_loads
59 ; SANDYB-NEXT: vinsertf128
68 %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
69 %v1 = load <4 x float>* %ptr, align 1
70 %v2 = load <4 x float>* %ptr2, align 1
71 %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
72 %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
76 ; Swap the operands of the shufflevector and vinsertf128 to ensure that the
77 ; pattern still matches.
78 define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {
79 ; CHECK-LABEL: combine_16_byte_loads_swap
82 ; SANDYB-NEXT: vinsertf128
91 %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
92 %v1 = load <4 x float>* %ptr, align 1
93 %v2 = load <4 x float>* %ptr2, align 1
94 %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
95 %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
99 ; Replace the vinsertf128 intrinsic with a shufflevector as might be
100 ; expected from auto-vectorized code.
101 define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
102 ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic
105 ; SANDYB-NEXT: vinsertf128
114 %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
115 %v1 = load <4 x float>* %ptr, align 1
116 %v2 = load <4 x float>* %ptr2, align 1
117 %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
121 ; Swap the order of the shufflevector operands to ensure that the
122 ; pattern still matches.
123 define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
124 ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic_swap
127 ; SANDYB-NEXT: vinsertf128
136 %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
137 %v1 = load <4 x float>* %ptr, align 1
138 %v2 = load <4 x float>* %ptr2, align 1
139 %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
143 ; Check each element type other than float to make sure it is handled correctly.
144 ; Use the loaded values with an 'add' to make sure we're using the correct load type.
145 ; Even though BtVer2 has fast 32-byte loads, we should not generate those for
146 ; 256-bit integer vectors because BtVer2 doesn't have AVX2.
148 define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
149 ; CHECK-LABEL: combine_16_byte_loads_i64
151 ; SANDYB: vextractf128
152 ; SANDYB-NEXT: vpaddq
153 ; SANDYB-NEXT: vpaddq
154 ; SANDYB-NEXT: vinsertf128
157 ; BTVER2: vextractf128
158 ; BTVER2-NEXT: vpaddq
159 ; BTVER2-NEXT: vpaddq
160 ; BTVER2-NEXT: vinsertf128
164 ; HASWELL-NEXT: vpaddq
167 %ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 1
168 %v1 = load <2 x i64>* %ptr, align 1
169 %v2 = load <2 x i64>* %ptr2, align 1
170 %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
171 %v4 = add <4 x i64> %v3, %x
175 define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
176 ; CHECK-LABEL: combine_16_byte_loads_i32
178 ; SANDYB: vextractf128
179 ; SANDYB-NEXT: vpaddd
180 ; SANDYB-NEXT: vpaddd
181 ; SANDYB-NEXT: vinsertf128
184 ; BTVER2: vextractf128
185 ; BTVER2-NEXT: vpaddd
186 ; BTVER2-NEXT: vpaddd
187 ; BTVER2-NEXT: vinsertf128
191 ; HASWELL-NEXT: vpaddd
194 %ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 1
195 %v1 = load <4 x i32>* %ptr, align 1
196 %v2 = load <4 x i32>* %ptr2, align 1
197 %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
198 %v4 = add <8 x i32> %v3, %x
202 define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
203 ; CHECK-LABEL: combine_16_byte_loads_i16
205 ; SANDYB: vextractf128
206 ; SANDYB-NEXT: vpaddw
207 ; SANDYB-NEXT: vpaddw
208 ; SANDYB-NEXT: vinsertf128
211 ; BTVER2: vextractf128
212 ; BTVER2-NEXT: vpaddw
213 ; BTVER2-NEXT: vpaddw
214 ; BTVER2-NEXT: vinsertf128
218 ; HASWELL-NEXT: vpaddw
221 %ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 1
222 %v1 = load <8 x i16>* %ptr, align 1
223 %v2 = load <8 x i16>* %ptr2, align 1
224 %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
225 %v4 = add <16 x i16> %v3, %x
229 define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
230 ; CHECK-LABEL: combine_16_byte_loads_i8
232 ; SANDYB: vextractf128
233 ; SANDYB-NEXT: vpaddb
234 ; SANDYB-NEXT: vpaddb
235 ; SANDYB-NEXT: vinsertf128
238 ; BTVER2: vextractf128
239 ; BTVER2-NEXT: vpaddb
240 ; BTVER2-NEXT: vpaddb
241 ; BTVER2-NEXT: vinsertf128
245 ; HASWELL-NEXT: vpaddb
248 %ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 1
249 %v1 = load <16 x i8>* %ptr, align 1
250 %v2 = load <16 x i8>* %ptr2, align 1
251 %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
252 %v4 = add <32 x i8> %v3, %x
256 define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x double> %x) {
257 ; CHECK-LABEL: combine_16_byte_loads_double
260 ; SANDYB-NEXT: vinsertf128
261 ; SANDYB-NEXT: vaddpd
265 ; BTVER2-NEXT: vaddpd
272 %ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 1
273 %v1 = load <2 x double>* %ptr, align 1
274 %v2 = load <2 x double>* %ptr2, align 1
275 %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
276 %v4 = fadd <4 x double> %v3, %x