; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2

; Make sure that we generate non-temporal stores for the test cases below.
; We use xorps for zeroing, so domain information isn't available anymore.
define void @test_zero_v4f32(<4 x float>* %dst) {
; CHECK-LABEL: test_zero_v4f32:
; SSE: movntps %xmm
; AVX: vmovntps %xmm
  store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1
  ret void
}
define void @test_zero_v4i32(<4 x i32>* %dst) {
; CHECK-LABEL: test_zero_v4i32:
; SSE: movntps %xmm
; AVX: vmovntps %xmm
  store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
  ret void
}
define void @test_zero_v2f64(<2 x double>* %dst) {
; CHECK-LABEL: test_zero_v2f64:
; SSE: movntps %xmm
; AVX: vmovntps %xmm
  store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1
  ret void
}
define void @test_zero_v2i64(<2 x i64>* %dst) {
; CHECK-LABEL: test_zero_v2i64:
; SSE: movntps %xmm
; AVX: vmovntps %xmm
  store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 16, !nontemporal !1
  ret void
}
define void @test_zero_v8i16(<8 x i16>* %dst) {
; CHECK-LABEL: test_zero_v8i16:
; SSE: movntps %xmm
; AVX: vmovntps %xmm
  store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 16, !nontemporal !1
  ret void
}
define void @test_zero_v16i8(<16 x i8>* %dst) {
; CHECK-LABEL: test_zero_v16i8:
; SSE: movntps %xmm
; AVX: vmovntps %xmm
  store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 16, !nontemporal !1
  ret void
}
; And now YMM versions.
define void @test_zero_v8f32(<8 x float>* %dst) {
; CHECK-LABEL: test_zero_v8f32:
; AVX: vmovntps %ymm
  store <8 x float> zeroinitializer, <8 x float>* %dst, align 32, !nontemporal !1
  ret void
}
define void @test_zero_v8i32(<8 x i32>* %dst) {
; CHECK-LABEL: test_zero_v8i32:
; AVX2: vmovntps %ymm
  store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 32, !nontemporal !1
  ret void
}
define void @test_zero_v4f64(<4 x double>* %dst) {
; CHECK-LABEL: test_zero_v4f64:
; AVX: vmovntps %ymm
  store <4 x double> zeroinitializer, <4 x double>* %dst, align 32, !nontemporal !1
  ret void
}
define void @test_zero_v4i64(<4 x i64>* %dst) {
; CHECK-LABEL: test_zero_v4i64:
; AVX2: vmovntps %ymm
  store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 32, !nontemporal !1
  ret void
}
define void @test_zero_v16i16(<16 x i16>* %dst) {
; CHECK-LABEL: test_zero_v16i16:
; AVX2: vmovntps %ymm
  store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 32, !nontemporal !1
  ret void
}
define void @test_zero_v32i8(<32 x i8>* %dst) {
; CHECK-LABEL: test_zero_v32i8:
; AVX2: vmovntps %ymm
  store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 32, !nontemporal !1
  ret void
}
; Check that we also handle arguments. Here the type survives longer.
define void @test_arg_v4f32(<4 x float> %arg, <4 x float>* %dst) {
; CHECK-LABEL: test_arg_v4f32:
; SSE: movntps %xmm
; AVX: vmovntps %xmm
  store <4 x float> %arg, <4 x float>* %dst, align 16, !nontemporal !1
  ret void
}
define void @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %dst) {
; CHECK-LABEL: test_arg_v4i32:
; SSE: movntdq %xmm
; AVX: vmovntdq %xmm
  store <4 x i32> %arg, <4 x i32>* %dst, align 16, !nontemporal !1
  ret void
}
define void @test_arg_v2f64(<2 x double> %arg, <2 x double>* %dst) {
; CHECK-LABEL: test_arg_v2f64:
; SSE: movntpd %xmm
; AVX: vmovntpd %xmm
  store <2 x double> %arg, <2 x double>* %dst, align 16, !nontemporal !1
  ret void
}
define void @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %dst) {
; CHECK-LABEL: test_arg_v2i64:
; SSE: movntdq %xmm
; AVX: vmovntdq %xmm
  store <2 x i64> %arg, <2 x i64>* %dst, align 16, !nontemporal !1
  ret void
}
define void @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %dst) {
; CHECK-LABEL: test_arg_v8i16:
; SSE: movntdq %xmm
; AVX: vmovntdq %xmm
  store <8 x i16> %arg, <8 x i16>* %dst, align 16, !nontemporal !1
  ret void
}
define void @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %dst) {
; CHECK-LABEL: test_arg_v16i8:
; SSE: movntdq %xmm
; AVX: vmovntdq %xmm
  store <16 x i8> %arg, <16 x i8>* %dst, align 16, !nontemporal !1
  ret void
}
; And now YMM versions.
define void @test_arg_v8f32(<8 x float> %arg, <8 x float>* %dst) {
; CHECK-LABEL: test_arg_v8f32:
; AVX: vmovntps %ymm
  store <8 x float> %arg, <8 x float>* %dst, align 32, !nontemporal !1
  ret void
}
define void @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %dst) {
; CHECK-LABEL: test_arg_v8i32:
; AVX2: vmovntps %ymm
  store <8 x i32> %arg, <8 x i32>* %dst, align 32, !nontemporal !1
  ret void
}
define void @test_arg_v4f64(<4 x double> %arg, <4 x double>* %dst) {
; CHECK-LABEL: test_arg_v4f64:
; AVX: vmovntpd %ymm
  store <4 x double> %arg, <4 x double>* %dst, align 32, !nontemporal !1
  ret void
}
define void @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %dst) {
; CHECK-LABEL: test_arg_v4i64:
; AVX2: vmovntps %ymm
  store <4 x i64> %arg, <4 x i64>* %dst, align 32, !nontemporal !1
  ret void
}
define void @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %dst) {
; CHECK-LABEL: test_arg_v16i16:
; AVX2: vmovntps %ymm
  store <16 x i16> %arg, <16 x i16>* %dst, align 32, !nontemporal !1
  ret void
}
define void @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %dst) {
; CHECK-LABEL: test_arg_v32i8:
; AVX2: vmovntps %ymm
  store <32 x i8> %arg, <32 x i8>* %dst, align 32, !nontemporal !1
  ret void
}
; Now check that if the execution domain is trivially visible, we use it.
; We use an add to make the type survive all the way to the MOVNT.
define void @test_op_v4f32(<4 x float> %a, <4 x float> %b, <4 x float>* %dst) {
; CHECK-LABEL: test_op_v4f32:
; SSE: movntps %xmm
; AVX: vmovntps %xmm
  %r = fadd <4 x float> %a, %b
  store <4 x float> %r, <4 x float>* %dst, align 16, !nontemporal !1
  ret void
}
define void @test_op_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32>* %dst) {
; CHECK-LABEL: test_op_v4i32:
; SSE: movntdq %xmm
; AVX: vmovntdq %xmm
  %r = add <4 x i32> %a, %b
  store <4 x i32> %r, <4 x i32>* %dst, align 16, !nontemporal !1
  ret void
}
define void @test_op_v2f64(<2 x double> %a, <2 x double> %b, <2 x double>* %dst) {
; CHECK-LABEL: test_op_v2f64:
; SSE: movntpd %xmm
; AVX: vmovntpd %xmm
  %r = fadd <2 x double> %a, %b
  store <2 x double> %r, <2 x double>* %dst, align 16, !nontemporal !1
  ret void
}
define void @test_op_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64>* %dst) {
; CHECK-LABEL: test_op_v2i64:
; SSE: movntdq %xmm
; AVX: vmovntdq %xmm
  %r = add <2 x i64> %a, %b
  store <2 x i64> %r, <2 x i64>* %dst, align 16, !nontemporal !1
  ret void
}
define void @test_op_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16>* %dst) {
; CHECK-LABEL: test_op_v8i16:
; SSE: movntdq %xmm
; AVX: vmovntdq %xmm
  %r = add <8 x i16> %a, %b
  store <8 x i16> %r, <8 x i16>* %dst, align 16, !nontemporal !1
  ret void
}
define void @test_op_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8>* %dst) {
; CHECK-LABEL: test_op_v16i8:
; SSE: movntdq %xmm
; AVX: vmovntdq %xmm
  %r = add <16 x i8> %a, %b
  store <16 x i8> %r, <16 x i8>* %dst, align 16, !nontemporal !1
  ret void
}
; And now YMM versions.
define void @test_op_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
; CHECK-LABEL: test_op_v8f32:
; AVX: vmovntps %ymm
  %r = fadd <8 x float> %a, %b
  store <8 x float> %r, <8 x float>* %dst, align 32, !nontemporal !1
  ret void
}
define void @test_op_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %dst) {
; CHECK-LABEL: test_op_v8i32:
; AVX2: vmovntdq %ymm
  %r = add <8 x i32> %a, %b
  store <8 x i32> %r, <8 x i32>* %dst, align 32, !nontemporal !1
  ret void
}
define void @test_op_v4f64(<4 x double> %a, <4 x double> %b, <4 x double>* %dst) {
; CHECK-LABEL: test_op_v4f64:
; AVX: vmovntpd %ymm
  %r = fadd <4 x double> %a, %b
  store <4 x double> %r, <4 x double>* %dst, align 32, !nontemporal !1
  ret void
}
define void @test_op_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %dst) {
; CHECK-LABEL: test_op_v4i64:
; AVX2: vmovntdq %ymm
  %r = add <4 x i64> %a, %b
  store <4 x i64> %r, <4 x i64>* %dst, align 32, !nontemporal !1
  ret void
}
define void @test_op_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %dst) {
; CHECK-LABEL: test_op_v16i16:
; AVX2: vmovntdq %ymm
  %r = add <16 x i16> %a, %b
  store <16 x i16> %r, <16 x i16>* %dst, align 32, !nontemporal !1
  ret void
}
define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
; CHECK-LABEL: test_op_v32i8:
; AVX2: vmovntdq %ymm
  %r = add <32 x i8> %a, %b
  store <32 x i8> %r, <32 x i8>* %dst, align 32, !nontemporal !1
  ret void
}
; 256-bit NT stores require 256-bit alignment.
; FIXME: For AVX, we could lower this to 2x movntps %xmm. Taken further, we
; could even scalarize to movnti when we have 1-alignment: nontemporal is
; probably always worth even some 20 instruction scalarization.
307 define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
308 ; CHECK-LABEL: test_unaligned_v8f32:
313 %r = fadd <8 x float> %a, %b
314 store <8 x float> %r, <8 x float>* %dst, align 16, !nontemporal !1