From 232202439abcda39a8c2cd8f897a09c245fbe7dc Mon Sep 17 00:00:00 2001 From: Robert Khasanov Date: Wed, 13 Aug 2014 10:46:00 +0000 Subject: [PATCH] [SKX] Extended non-temporal load/store instructions for AVX512VL subsets. Added avx512_movnt_vl multiclass for handling 256/128-bit forms of instruction. Added encoding and lowering tests. Reviewed by Elena Demikhovsky git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@215536 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsX86.td | 4 +- lib/Target/X86/X86InstrAVX512.td | 98 ++++++++---- lib/Target/X86/X86InstrInfo.td | 1 + lib/Target/X86/X86InstrSSE.td | 2 + test/CodeGen/X86/avx512vl-nontemporal.ll | 34 ++++ test/MC/X86/avx512-encodings.s | 96 ++++++++++++ test/MC/X86/x86-64-avx512f_vl.s | 192 +++++++++++++++++++++++ 7 files changed, 391 insertions(+), 36 deletions(-) create mode 100644 test/CodeGen/X86/avx512vl-nontemporal.ll diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 5f8ae27839c..1f5ad9d1523 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -1954,8 +1954,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". llvm_i32_ty], [IntrNoMem, Commutative]>; def int_x86_avx2_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa256">, Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty], [IntrReadMem]>; - def int_x86_avx512_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa512">, - Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty], [IntrReadMem]>; } //===----------------------------------------------------------------------===// @@ -3219,6 +3217,8 @@ let TargetPrefix = "x86" in { Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa512">, + Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty], [IntrReadMem]>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 02be95f48da..3fe62d60d3c 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -2090,43 +2090,73 @@ def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), //===----------------------------------------------------------------------===// // AVX-512 - Non-temporals //===----------------------------------------------------------------------===// +let SchedRW = [WriteLoad] in { + def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst), + (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", + [(set VR512:$dst, (int_x86_avx512_movntdqa addr:$src))], + SSEPackedInt>, EVEX, T8PD, EVEX_V512, + EVEX_CD8<64, CD8VF>; + + let Predicates = [HasAVX512, HasVLX] in { + def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst), + (ins i256mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", [], + SSEPackedInt>, EVEX, T8PD, EVEX_V256, + EVEX_CD8<64, CD8VF>; + + def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst), + (ins i128mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", [], + SSEPackedInt>, EVEX, T8PD, EVEX_V128, + EVEX_CD8<64, CD8VF>; + } +} -def VMOVNTDQAZrm : AVX5128I<0x2A, MRMSrcMem, (outs VR512:$dst), - (ins i512mem:$src), - "vmovntdqa\t{$src, $dst|$dst, $src}", - [(set VR512:$dst, - (int_x86_avx512_movntdqa addr:$src))]>, - EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>; - -// Prefer non-temporal over temporal versions -let AddedComplexity = 400, SchedRW = [WriteStore] in { - -def VMOVNTPSZmr : AVX512PSI<0x2B, MRMDestMem, (outs), - 
                            (ins f512mem:$dst, VR512:$src),
-                           "vmovntps\t{$src, $dst|$dst, $src}",
-                           [(alignednontemporalstore (v16f32 VR512:$src),
-                                                      addr:$dst)],
-                           IIC_SSE_MOVNT>,
-                           EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
-
-def VMOVNTPDZmr : AVX512PDI<0x2B, MRMDestMem, (outs),
-                            (ins f512mem:$dst, VR512:$src),
-                            "vmovntpd\t{$src, $dst|$dst, $src}",
-                            [(alignednontemporalstore (v8f64 VR512:$src),
-                                                       addr:$dst)],
-                            IIC_SSE_MOVNT>,
-                            EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-
-
-def VMOVNTDQZmr : AVX512BI<0xE7, MRMDestMem, (outs),
-                           (ins i512mem:$dst, VR512:$src),
-                           "vmovntdq\t{$src, $dst|$dst, $src}",
-                           [(alignednontemporalstore (v8i64 VR512:$src),
-                                                      addr:$dst)],
-                           IIC_SSE_MOVNT>,
-                           EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>;
+multiclass avx512_movnt<bits<8> opc, string OpcodeStr, PatFrag st_frag,
+                        ValueType OpVT, RegisterClass RC, X86MemOperand memop,
+                        Domain d, InstrItinClass itin = IIC_SSE_MOVNT> {
+  let SchedRW = [WriteStore], mayStore = 1,
+      AddedComplexity = 400 in
+  def mr : AVX512PI<opc, MRMDestMem, (outs), (ins memop:$dst, RC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(st_frag (OpVT RC:$src), addr:$dst)],
+                    d, itin>, EVEX;
 }

+multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr, PatFrag st_frag,
+                           string elty, string elsz, string vsz512,
+                           string vsz256, string vsz128, Domain d,
+                           Predicate prd, InstrItinClass itin = IIC_SSE_MOVNT> {
+  let Predicates = [prd] in
+  defm Z : avx512_movnt<opc, OpcodeStr, st_frag,
+                        !cast<ValueType>("v"##vsz512##elty##elsz), VR512,
+                        !cast<X86MemOperand>(elty##"512mem"), d, itin>,
+                        EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_movnt<opc, OpcodeStr, st_frag,
+                             !cast<ValueType>("v"##vsz256##elty##elsz), VR256X,
+                             !cast<X86MemOperand>(elty##"256mem"), d, itin>,
+                             EVEX_V256;
+
+    defm Z128 : avx512_movnt<opc, OpcodeStr, st_frag,
+                             !cast<ValueType>("v"##vsz128##elty##elsz), VR128X,
+                             !cast<X86MemOperand>(elty##"128mem"), d, itin>,
+                             EVEX_V128;
+  }
+}
+
+defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", alignednontemporalstore,
+                                "i", "64", "8", "4", "2", SSEPackedInt,
+                                HasAVX512>, PD, EVEX_CD8<64, CD8VF>;
+
+defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", alignednontemporalstore,
+                                "f", "64", "8", "4", "2", SSEPackedDouble,
+                                HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", alignednontemporalstore,
+                                "f", "32", "16", "8", "4", SSEPackedSingle,
+                                HasAVX512>, PS, EVEX_CD8<32, CD8VF>;
+
 //===----------------------------------------------------------------------===//
 // AVX-512 - Integer arithmetic
 //
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index f2f53f32120..b262ec22f94 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -727,6 +727,7 @@ def HasDQI : Predicate<"Subtarget->hasDQI()">;
 def HasBWI : Predicate<"Subtarget->hasBWI()">;
 def HasVLX : Predicate<"Subtarget->hasVLX()">,
                        AssemblerPredicate<"FeatureVLX", "AVX-512 VLX ISA">;
+def NoVLX : Predicate<"!Subtarget->hasVLX()">;

 def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
 def HasAES : Predicate<"Subtarget->hasAES()">;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index ea8b6c7cf43..2189d14c07b 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3697,6 +3697,7 @@ let Predicates = [UseSSE1] in {

 let AddedComplexity = 400 in { // Prefer non-temporal versions
 let SchedRW = [WriteStore] in {
+let Predicates = [HasAVX, NoVLX] in {
 def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                       (ins f128mem:$dst, VR128:$src),
                       "movntps\t{$src, $dst|$dst, $src}",
@@ -3737,6 +3738,7 @@ def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
                         [(alignednontemporalstore (v4i64 VR256:$src),
                                                    addr:$dst)],
                         IIC_SSE_MOVNT>, VEX, VEX_L;
+}

 def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
diff --git 
a/test/CodeGen/X86/avx512vl-nontemporal.ll b/test/CodeGen/X86/avx512vl-nontemporal.ll new file mode 100644 index 00000000000..2ad9768a095 --- /dev/null +++ b/test/CodeGen/X86/avx512vl-nontemporal.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx --show-mc-encoding | FileCheck %s + +define void @f256(<8 x float> %A, <8 x float> %AA, i8* %B, <4 x double> %C, <4 x double> %CC, i32 %D, <4 x i64> %E, <4 x i64> %EE) { +; CHECK: vmovntps %ymm{{.*}} ## encoding: [0x62 + %cast = bitcast i8* %B to <8 x float>* + %A2 = fadd <8 x float> %A, %AA + store <8 x float> %A2, <8 x float>* %cast, align 64, !nontemporal !0 +; CHECK: vmovntdq %ymm{{.*}} ## encoding: [0x62 + %cast1 = bitcast i8* %B to <4 x i64>* + %E2 = add <4 x i64> %E, %EE + store <4 x i64> %E2, <4 x i64>* %cast1, align 64, !nontemporal !0 +; CHECK: vmovntpd %ymm{{.*}} ## encoding: [0x62 + %cast2 = bitcast i8* %B to <4 x double>* + %C2 = fadd <4 x double> %C, %CC + store <4 x double> %C2, <4 x double>* %cast2, align 64, !nontemporal !0 + ret void +} + +define void @f128(<4 x float> %A, <4 x float> %AA, i8* %B, <2 x double> %C, <2 x double> %CC, i32 %D, <2 x i64> %E, <2 x i64> %EE) { +; CHECK: vmovntps %xmm{{.*}} ## encoding: [0x62 + %cast = bitcast i8* %B to <4 x float>* + %A2 = fadd <4 x float> %A, %AA + store <4 x float> %A2, <4 x float>* %cast, align 64, !nontemporal !0 +; CHECK: vmovntdq %xmm{{.*}} ## encoding: [0x62 + %cast1 = bitcast i8* %B to <2 x i64>* + %E2 = add <2 x i64> %E, %EE + store <2 x i64> %E2, <2 x i64>* %cast1, align 64, !nontemporal !0 +; CHECK: vmovntpd %xmm{{.*}} ## encoding: [0x62 + %cast2 = bitcast i8* %B to <2 x double>* + %C2 = fadd <2 x double> %C, %CC + store <2 x double> %C2, <2 x double>* %cast2, align 64, !nontemporal !0 + ret void +} +!0 = metadata !{i32 1} diff --git a/test/MC/X86/avx512-encodings.s b/test/MC/X86/avx512-encodings.s index a1a94fbaa54..8cd7aa41400 100644 --- a/test/MC/X86/avx512-encodings.s +++ b/test/MC/X86/avx512-encodings.s @@ -665,6 +665,102 @@ // CHECK: encoding: [0x62,0xf1,0xfe,0x48,0x6f,0xb2,0xc0,0xdf,0xff,0xff] vmovdqu64 -8256(%rdx), %zmm6 +// CHECK: vmovntdq %zmm24, (%rcx) +// CHECK: encoding: [0x62,0x61,0x7d,0x48,0xe7,0x01] + vmovntdq %zmm24, (%rcx) + +// CHECK: vmovntdq %zmm24, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x21,0x7d,0x48,0xe7,0x84,0xf0,0x23,0x01,0x00,0x00] + vmovntdq %zmm24, 291(%rax,%r14,8) + +// CHECK: vmovntdq %zmm24, 8128(%rdx) +// CHECK: encoding: [0x62,0x61,0x7d,0x48,0xe7,0x42,0x7f] + vmovntdq %zmm24, 8128(%rdx) + +// CHECK: vmovntdq %zmm24, 8192(%rdx) +// CHECK: encoding: [0x62,0x61,0x7d,0x48,0xe7,0x82,0x00,0x20,0x00,0x00] + vmovntdq %zmm24, 8192(%rdx) + +// CHECK: vmovntdq %zmm24, -8192(%rdx) +// CHECK: encoding: [0x62,0x61,0x7d,0x48,0xe7,0x42,0x80] + vmovntdq %zmm24, -8192(%rdx) + +// CHECK: vmovntdq %zmm24, -8256(%rdx) +// CHECK: encoding: [0x62,0x61,0x7d,0x48,0xe7,0x82,0xc0,0xdf,0xff,0xff] + vmovntdq %zmm24, -8256(%rdx) + +// CHECK: vmovntdqa (%rcx), %zmm17 +// CHECK: encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x09] + vmovntdqa (%rcx), %zmm17 + +// CHECK: vmovntdqa 291(%rax,%r14,8), %zmm17 +// CHECK: encoding: [0x62,0xa2,0x7d,0x48,0x2a,0x8c,0xf0,0x23,0x01,0x00,0x00] + vmovntdqa 291(%rax,%r14,8), %zmm17 + +// CHECK: vmovntdqa 8128(%rdx), %zmm17 +// CHECK: encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x4a,0x7f] + vmovntdqa 8128(%rdx), %zmm17 + +// CHECK: vmovntdqa 8192(%rdx), %zmm17 +// CHECK: encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x8a,0x00,0x20,0x00,0x00] + vmovntdqa 8192(%rdx), %zmm17 + +// CHECK: vmovntdqa -8192(%rdx), %zmm17 +// CHECK: 
encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x4a,0x80] + vmovntdqa -8192(%rdx), %zmm17 + +// CHECK: vmovntdqa -8256(%rdx), %zmm17 +// CHECK: encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x8a,0xc0,0xdf,0xff,0xff] + vmovntdqa -8256(%rdx), %zmm17 + +// CHECK: vmovntpd %zmm17, (%rcx) +// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x09] + vmovntpd %zmm17, (%rcx) + +// CHECK: vmovntpd %zmm17, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa1,0xfd,0x48,0x2b,0x8c,0xf0,0x23,0x01,0x00,0x00] + vmovntpd %zmm17, 291(%rax,%r14,8) + +// CHECK: vmovntpd %zmm17, 8128(%rdx) +// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x4a,0x7f] + vmovntpd %zmm17, 8128(%rdx) + +// CHECK: vmovntpd %zmm17, 8192(%rdx) +// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x8a,0x00,0x20,0x00,0x00] + vmovntpd %zmm17, 8192(%rdx) + +// CHECK: vmovntpd %zmm17, -8192(%rdx) +// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x4a,0x80] + vmovntpd %zmm17, -8192(%rdx) + +// CHECK: vmovntpd %zmm17, -8256(%rdx) +// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x8a,0xc0,0xdf,0xff,0xff] + vmovntpd %zmm17, -8256(%rdx) + +// CHECK: vmovntps %zmm5, (%rcx) +// CHECK: encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x29] + vmovntps %zmm5, (%rcx) + +// CHECK: vmovntps %zmm5, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xb1,0x7c,0x48,0x2b,0xac,0xf0,0x23,0x01,0x00,0x00] + vmovntps %zmm5, 291(%rax,%r14,8) + +// CHECK: vmovntps %zmm5, 8128(%rdx) +// CHECK: encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x6a,0x7f] + vmovntps %zmm5, 8128(%rdx) + +// CHECK: vmovntps %zmm5, 8192(%rdx) +// CHECK: encoding: [0x62,0xf1,0x7c,0x48,0x2b,0xaa,0x00,0x20,0x00,0x00] + vmovntps %zmm5, 8192(%rdx) + +// CHECK: vmovntps %zmm5, -8192(%rdx) +// CHECK: encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x6a,0x80] + vmovntps %zmm5, -8192(%rdx) + +// CHECK: vmovntps %zmm5, -8256(%rdx) +// CHECK: encoding: [0x62,0xf1,0x7c,0x48,0x2b,0xaa,0xc0,0xdf,0xff,0xff] + vmovntps %zmm5, -8256(%rdx) + // CHECK: vmovupd %zmm9, %zmm27 // CHECK: encoding: [0x62,0x41,0xfd,0x48,0x10,0xd9] vmovupd %zmm9, %zmm27 diff --git a/test/MC/X86/x86-64-avx512f_vl.s b/test/MC/X86/x86-64-avx512f_vl.s index 83320dfcb37..b529c03791a 100644 --- a/test/MC/X86/x86-64-avx512f_vl.s +++ b/test/MC/X86/x86-64-avx512f_vl.s @@ -432,6 +432,198 @@ // CHECK: encoding: [0x62,0x61,0xfe,0x28,0x6f,0xaa,0xe0,0xef,0xff,0xff] vmovdqu64 -4128(%rdx), %ymm29 +// CHECK: vmovntdq %xmm22, (%rcx) +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0xe7,0x31] + vmovntdq %xmm22, (%rcx) + +// CHECK: vmovntdq %xmm22, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa1,0x7d,0x08,0xe7,0xb4,0xf0,0x23,0x01,0x00,0x00] + vmovntdq %xmm22, 291(%rax,%r14,8) + +// CHECK: vmovntdq %xmm22, 2032(%rdx) +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0xe7,0x72,0x7f] + vmovntdq %xmm22, 2032(%rdx) + +// CHECK: vmovntdq %xmm22, 2048(%rdx) +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0xe7,0xb2,0x00,0x08,0x00,0x00] + vmovntdq %xmm22, 2048(%rdx) + +// CHECK: vmovntdq %xmm22, -2048(%rdx) +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0xe7,0x72,0x80] + vmovntdq %xmm22, -2048(%rdx) + +// CHECK: vmovntdq %xmm22, -2064(%rdx) +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0xe7,0xb2,0xf0,0xf7,0xff,0xff] + vmovntdq %xmm22, -2064(%rdx) + +// CHECK: vmovntdq %ymm19, (%rcx) +// CHECK: encoding: [0x62,0xe1,0x7d,0x28,0xe7,0x19] + vmovntdq %ymm19, (%rcx) + +// CHECK: vmovntdq %ymm19, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa1,0x7d,0x28,0xe7,0x9c,0xf0,0x23,0x01,0x00,0x00] + vmovntdq %ymm19, 291(%rax,%r14,8) + +// CHECK: vmovntdq %ymm19, 4064(%rdx) +// CHECK: encoding: [0x62,0xe1,0x7d,0x28,0xe7,0x5a,0x7f] + vmovntdq %ymm19, 4064(%rdx) + +// CHECK: vmovntdq %ymm19, 
4096(%rdx) +// CHECK: encoding: [0x62,0xe1,0x7d,0x28,0xe7,0x9a,0x00,0x10,0x00,0x00] + vmovntdq %ymm19, 4096(%rdx) + +// CHECK: vmovntdq %ymm19, -4096(%rdx) +// CHECK: encoding: [0x62,0xe1,0x7d,0x28,0xe7,0x5a,0x80] + vmovntdq %ymm19, -4096(%rdx) + +// CHECK: vmovntdq %ymm19, -4128(%rdx) +// CHECK: encoding: [0x62,0xe1,0x7d,0x28,0xe7,0x9a,0xe0,0xef,0xff,0xff] + vmovntdq %ymm19, -4128(%rdx) + +// CHECK: vmovntdqa (%rcx), %xmm24 +// CHECK: encoding: [0x62,0x62,0x7d,0x08,0x2a,0x01] + vmovntdqa (%rcx), %xmm24 + +// CHECK: vmovntdqa 291(%rax,%r14,8), %xmm24 +// CHECK: encoding: [0x62,0x22,0x7d,0x08,0x2a,0x84,0xf0,0x23,0x01,0x00,0x00] + vmovntdqa 291(%rax,%r14,8), %xmm24 + +// CHECK: vmovntdqa 2032(%rdx), %xmm24 +// CHECK: encoding: [0x62,0x62,0x7d,0x08,0x2a,0x42,0x7f] + vmovntdqa 2032(%rdx), %xmm24 + +// CHECK: vmovntdqa 2048(%rdx), %xmm24 +// CHECK: encoding: [0x62,0x62,0x7d,0x08,0x2a,0x82,0x00,0x08,0x00,0x00] + vmovntdqa 2048(%rdx), %xmm24 + +// CHECK: vmovntdqa -2048(%rdx), %xmm24 +// CHECK: encoding: [0x62,0x62,0x7d,0x08,0x2a,0x42,0x80] + vmovntdqa -2048(%rdx), %xmm24 + +// CHECK: vmovntdqa -2064(%rdx), %xmm24 +// CHECK: encoding: [0x62,0x62,0x7d,0x08,0x2a,0x82,0xf0,0xf7,0xff,0xff] + vmovntdqa -2064(%rdx), %xmm24 + +// CHECK: vmovntdqa (%rcx), %ymm28 +// CHECK: encoding: [0x62,0x62,0x7d,0x28,0x2a,0x21] + vmovntdqa (%rcx), %ymm28 + +// CHECK: vmovntdqa 291(%rax,%r14,8), %ymm28 +// CHECK: encoding: [0x62,0x22,0x7d,0x28,0x2a,0xa4,0xf0,0x23,0x01,0x00,0x00] + vmovntdqa 291(%rax,%r14,8), %ymm28 + +// CHECK: vmovntdqa 4064(%rdx), %ymm28 +// CHECK: encoding: [0x62,0x62,0x7d,0x28,0x2a,0x62,0x7f] + vmovntdqa 4064(%rdx), %ymm28 + +// CHECK: vmovntdqa 4096(%rdx), %ymm28 +// CHECK: encoding: [0x62,0x62,0x7d,0x28,0x2a,0xa2,0x00,0x10,0x00,0x00] + vmovntdqa 4096(%rdx), %ymm28 + +// CHECK: vmovntdqa -4096(%rdx), %ymm28 +// CHECK: encoding: [0x62,0x62,0x7d,0x28,0x2a,0x62,0x80] + vmovntdqa -4096(%rdx), %ymm28 + +// CHECK: vmovntdqa -4128(%rdx), %ymm28 +// CHECK: encoding: [0x62,0x62,0x7d,0x28,0x2a,0xa2,0xe0,0xef,0xff,0xff] + vmovntdqa -4128(%rdx), %ymm28 + +// CHECK: vmovntpd %xmm17, (%rcx) +// CHECK: encoding: [0x62,0xe1,0xfd,0x08,0x2b,0x09] + vmovntpd %xmm17, (%rcx) + +// CHECK: vmovntpd %xmm17, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa1,0xfd,0x08,0x2b,0x8c,0xf0,0x23,0x01,0x00,0x00] + vmovntpd %xmm17, 291(%rax,%r14,8) + +// CHECK: vmovntpd %xmm17, 2032(%rdx) +// CHECK: encoding: [0x62,0xe1,0xfd,0x08,0x2b,0x4a,0x7f] + vmovntpd %xmm17, 2032(%rdx) + +// CHECK: vmovntpd %xmm17, 2048(%rdx) +// CHECK: encoding: [0x62,0xe1,0xfd,0x08,0x2b,0x8a,0x00,0x08,0x00,0x00] + vmovntpd %xmm17, 2048(%rdx) + +// CHECK: vmovntpd %xmm17, -2048(%rdx) +// CHECK: encoding: [0x62,0xe1,0xfd,0x08,0x2b,0x4a,0x80] + vmovntpd %xmm17, -2048(%rdx) + +// CHECK: vmovntpd %xmm17, -2064(%rdx) +// CHECK: encoding: [0x62,0xe1,0xfd,0x08,0x2b,0x8a,0xf0,0xf7,0xff,0xff] + vmovntpd %xmm17, -2064(%rdx) + +// CHECK: vmovntpd %ymm27, (%rcx) +// CHECK: encoding: [0x62,0x61,0xfd,0x28,0x2b,0x19] + vmovntpd %ymm27, (%rcx) + +// CHECK: vmovntpd %ymm27, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x21,0xfd,0x28,0x2b,0x9c,0xf0,0x23,0x01,0x00,0x00] + vmovntpd %ymm27, 291(%rax,%r14,8) + +// CHECK: vmovntpd %ymm27, 4064(%rdx) +// CHECK: encoding: [0x62,0x61,0xfd,0x28,0x2b,0x5a,0x7f] + vmovntpd %ymm27, 4064(%rdx) + +// CHECK: vmovntpd %ymm27, 4096(%rdx) +// CHECK: encoding: [0x62,0x61,0xfd,0x28,0x2b,0x9a,0x00,0x10,0x00,0x00] + vmovntpd %ymm27, 4096(%rdx) + +// CHECK: vmovntpd %ymm27, -4096(%rdx) +// CHECK: encoding: [0x62,0x61,0xfd,0x28,0x2b,0x5a,0x80] + 
vmovntpd %ymm27, -4096(%rdx) + +// CHECK: vmovntpd %ymm27, -4128(%rdx) +// CHECK: encoding: [0x62,0x61,0xfd,0x28,0x2b,0x9a,0xe0,0xef,0xff,0xff] + vmovntpd %ymm27, -4128(%rdx) + +// CHECK: vmovntps %xmm26, (%rcx) +// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x2b,0x11] + vmovntps %xmm26, (%rcx) + +// CHECK: vmovntps %xmm26, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x21,0x7c,0x08,0x2b,0x94,0xf0,0x23,0x01,0x00,0x00] + vmovntps %xmm26, 291(%rax,%r14,8) + +// CHECK: vmovntps %xmm26, 2032(%rdx) +// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x2b,0x52,0x7f] + vmovntps %xmm26, 2032(%rdx) + +// CHECK: vmovntps %xmm26, 2048(%rdx) +// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x2b,0x92,0x00,0x08,0x00,0x00] + vmovntps %xmm26, 2048(%rdx) + +// CHECK: vmovntps %xmm26, -2048(%rdx) +// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x2b,0x52,0x80] + vmovntps %xmm26, -2048(%rdx) + +// CHECK: vmovntps %xmm26, -2064(%rdx) +// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x2b,0x92,0xf0,0xf7,0xff,0xff] + vmovntps %xmm26, -2064(%rdx) + +// CHECK: vmovntps %ymm28, (%rcx) +// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x2b,0x21] + vmovntps %ymm28, (%rcx) + +// CHECK: vmovntps %ymm28, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x21,0x7c,0x28,0x2b,0xa4,0xf0,0x23,0x01,0x00,0x00] + vmovntps %ymm28, 291(%rax,%r14,8) + +// CHECK: vmovntps %ymm28, 4064(%rdx) +// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x2b,0x62,0x7f] + vmovntps %ymm28, 4064(%rdx) + +// CHECK: vmovntps %ymm28, 4096(%rdx) +// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x2b,0xa2,0x00,0x10,0x00,0x00] + vmovntps %ymm28, 4096(%rdx) + +// CHECK: vmovntps %ymm28, -4096(%rdx) +// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x2b,0x62,0x80] + vmovntps %ymm28, -4096(%rdx) + +// CHECK: vmovntps %ymm28, -4128(%rdx) +// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x2b,0xa2,0xe0,0xef,0xff,0xff] + vmovntps %ymm28, -4128(%rdx) + // CHECK: vmovupd %xmm22, %xmm24 // CHECK: encoding: [0x62,0x21,0xfd,0x08,0x10,0xc6] vmovupd %xmm22, %xmm24 -- 2.34.1
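
Reviewer note, not part of the patch itself: each avx512_movnt_vl instantiation above relies on
TableGen name concatenation, so every defm produces one store record per vector width (Zmr,
Z256mr, Z128mr), with the 256/128-bit forms guarded by HasVLX. As an illustrative sketch only
(the record below is inferred from the avx512_movnt multiclass in this patch, not copied from
the tree), the 256-bit integer form expands to roughly:

  // Approximate expansion of "defm VMOVNTDQ : avx512_movnt_vl<0xE7, ...>" for the Z256 form.
  // Predicates = [HasAVX512, HasVLX], mayStore = 1, SchedRW = [WriteStore] and
  // AddedComplexity = 400 are applied by the surrounding let blocks of the multiclass.
  def VMOVNTDQZ256mr : AVX512PI<0xE7, MRMDestMem, (outs),
                                (ins i256mem:$dst, VR256X:$src),
                                "vmovntdq\t{$src, $dst|$dst, $src}",
                                [(alignednontemporalstore (v4i64 VR256X:$src),
                                                          addr:$dst)],
                                SSEPackedInt, IIC_SSE_MOVNT>,
                                EVEX, EVEX_V256, PD, EVEX_CD8<64, CD8VF>;

This is also why the VEX-encoded non-temporal stores in X86InstrSSE.td are now restricted to
[HasAVX, NoVLX]: when VLX is available, the EVEX-encoded patterns above handle the 128/256-bit
non-temporal stores, and the two sets of patterns would otherwise compete for the same stores.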