From 1c8b4f079651c15f5a54da9f545d87a17026f1fb Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky
Date: Thu, 19 Nov 2015 13:13:00 +0000
Subject: [PATCH] AVX-512: Fixed COPY_TO_REGCLASS for mask registers

Copying one mask register to another under BW should be done with the
kmovq instruction, otherwise we can lose some bits.
Copying 8 bits under DQ may be done with kmovb.
(A short illustration of the truncation follows the patch.)

Differential Revision: http://reviews.llvm.org/D14812

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@253563 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrInfo.cpp            | 62 +++++++++++++++----
 .../X86/avx512-gather-scatter-intrin.ll    | 20 +++---
 test/CodeGen/X86/avx512dq-intrinsics.ll    |  4 +-
 test/CodeGen/X86/avx512dqvl-intrinsics.ll  |  2 +-
 4 files changed, 62 insertions(+), 26 deletions(-)

diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 81002492543..9cd1cd9a0fb 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -4267,15 +4267,58 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
   return 0;
 }
 
-inline static bool MaskRegClassContains(unsigned Reg) {
+static bool MaskRegClassContains(unsigned Reg) {
   return X86::VK8RegClass.contains(Reg) ||
          X86::VK16RegClass.contains(Reg) ||
          X86::VK32RegClass.contains(Reg) ||
          X86::VK64RegClass.contains(Reg) ||
          X86::VK1RegClass.contains(Reg);
 }
+
+static bool GRRegClassContains(unsigned Reg) {
+  return X86::GR64RegClass.contains(Reg) ||
+         X86::GR32RegClass.contains(Reg) ||
+         X86::GR16RegClass.contains(Reg) ||
+         X86::GR8RegClass.contains(Reg);
+}
+static
+unsigned copyPhysRegOpcode_AVX512_DQ(unsigned& DestReg, unsigned& SrcReg) {
+  if (MaskRegClassContains(SrcReg) && X86::GR8RegClass.contains(DestReg)) {
+    DestReg = getX86SubSuperRegister(DestReg, MVT::i32);
+    return X86::KMOVBrk;
+  }
+  if (MaskRegClassContains(DestReg) && X86::GR8RegClass.contains(SrcReg)) {
+    SrcReg = getX86SubSuperRegister(SrcReg, MVT::i32);
+    return X86::KMOVBkr;
+  }
+  return 0;
+}
+
+static
+unsigned copyPhysRegOpcode_AVX512_BW(unsigned& DestReg, unsigned& SrcReg) {
+  if (MaskRegClassContains(SrcReg) && MaskRegClassContains(DestReg))
+    return X86::KMOVQkk;
+  if (MaskRegClassContains(SrcReg) && X86::GR32RegClass.contains(DestReg))
+    return X86::KMOVDrk;
+  if (MaskRegClassContains(SrcReg) && X86::GR64RegClass.contains(DestReg))
+    return X86::KMOVQrk;
+  if (MaskRegClassContains(DestReg) && X86::GR32RegClass.contains(SrcReg))
+    return X86::KMOVDkr;
+  if (MaskRegClassContains(DestReg) && X86::GR64RegClass.contains(SrcReg))
+    return X86::KMOVQkr;
+  return 0;
+}
+
 static
-unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) {
+unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg,
+                                  const X86Subtarget &Subtarget)
+{
+  if (Subtarget.hasDQI())
+    if (auto Opc = copyPhysRegOpcode_AVX512_DQ(DestReg, SrcReg))
+      return Opc;
+  if (Subtarget.hasBWI())
+    if (auto Opc = copyPhysRegOpcode_AVX512_BW(DestReg, SrcReg))
+      return Opc;
   if (X86::VR128XRegClass.contains(DestReg, SrcReg) ||
       X86::VR256XRegClass.contains(DestReg, SrcReg) ||
       X86::VR512RegClass.contains(DestReg, SrcReg)) {
@@ -4283,20 +4326,13 @@ unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) {
     SrcReg = get512BitSuperRegister(SrcReg);
     return X86::VMOVAPSZrr;
   }
-  if (MaskRegClassContains(DestReg) &&
-      MaskRegClassContains(SrcReg))
+  if (MaskRegClassContains(DestReg) && MaskRegClassContains(SrcReg))
     return X86::KMOVWkk;
-  if (MaskRegClassContains(DestReg) &&
-      (X86::GR32RegClass.contains(SrcReg) ||
-       X86::GR16RegClass.contains(SrcReg) ||
-       X86::GR8RegClass.contains(SrcReg))) {
+  if (MaskRegClassContains(DestReg) && GRRegClassContains(SrcReg)) {
     SrcReg = getX86SubSuperRegister(SrcReg, MVT::i32);
     return X86::KMOVWkr;
   }
-  if ((X86::GR32RegClass.contains(DestReg) ||
-       X86::GR16RegClass.contains(DestReg) ||
-       X86::GR8RegClass.contains(DestReg)) &&
-      MaskRegClassContains(SrcReg)) {
+  if (GRRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) {
     DestReg = getX86SubSuperRegister(DestReg, MVT::i32);
     return X86::KMOVWrk;
   }
@@ -4332,7 +4368,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   else if (X86::VR64RegClass.contains(DestReg, SrcReg))
     Opc = X86::MMX_MOVQ64rr;
   else if (HasAVX512)
-    Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg);
+    Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg, Subtarget);
   else if (X86::VR128RegClass.contains(DestReg, SrcReg))
     Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
   else if (X86::VR256RegClass.contains(DestReg, SrcReg))
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index 89d1bce12f8..b9594e1ac94 100644
--- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -14,7 +14,7 @@ define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8*
 ; CHECK-LABEL: gather_mask_dps:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    kmovw %k1, %k2
+; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
 ; CHECK-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
 ; CHECK-NEXT:    vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
@@ -29,7 +29,7 @@ define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %b
 ; CHECK-LABEL: gather_mask_dpd:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    kmovw %k1, %k2
+; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
 ; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
 ; CHECK-NEXT:    vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
@@ -44,7 +44,7 @@ define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %ba
 ; CHECK-LABEL: gather_mask_qps:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    kmovw %k1, %k2
+; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
 ; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
 ; CHECK-NEXT:    vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
@@ -59,7 +59,7 @@ define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %b
 ; CHECK-LABEL: gather_mask_qpd:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    kmovw %k1, %k2
+; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
 ; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
 ; CHECK-NEXT:    vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
@@ -86,7 +86,7 @@ define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %ba
 ; CHECK-LABEL: gather_mask_dd:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    kmovw %k1, %k2
+; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
 ; CHECK-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
 ; CHECK-NEXT:    vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
@@ -101,7 +101,7 @@ define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base,
 ; CHECK-LABEL: gather_mask_qd:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    kmovw %k1, %k2
+; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
 ; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
 ; CHECK-NEXT:    vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
@@ -116,7 +116,7 @@ define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base,
 ; CHECK-LABEL: gather_mask_qq:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    kmovw %k1, %k2
+; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
 ; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
 ; CHECK-NEXT:    vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
@@ -131,7 +131,7 @@ define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base,
 ; CHECK-LABEL: gather_mask_dq:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    kmovw %k1, %k2
+; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
 ; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
 ; CHECK-NEXT:    vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
@@ -400,7 +400,7 @@ define <4 x i32>@test_int_x86_avx512_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovb %esi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm2
-; CHECK-NEXT:    kmovw %k1, %k2
+; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
 ; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
@@ -538,7 +538,7 @@ define <8 x i32>@test_int_x86_avx512_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovb %esi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm2
-; CHECK-NEXT:    kmovw %k1, %k2
+; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2}
 ; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,2), %ymm0 {%k1}
 ; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll
index c25073aaad4..137b0e49fb1 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics.ll
@@ -322,7 +322,7 @@ declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double>, i32,
 define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    kmovb %edi, %k1
 ; CHECK-NEXT:    vextractf64x2 $1, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vextractf64x2 $1, %zmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vextractf64x2 $1, %zmm0, %xmm0
@@ -342,7 +342,7 @@ declare <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float>, i32, <
 define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x8:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    kmovb %edi, %k1
 ; CHECK-NEXT:    vextractf32x8 $1, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vextractf32x8 $1, %zmm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vextractf32x8 $1, %zmm0, %ymm0
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
index 598348fe6c5..b587e93f80a 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
@@ -1654,7 +1654,7 @@ declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double>, i32,
 define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    kmovb %edi, %k1
 ; CHECK-NEXT:    vextractf64x2 $1, %ymm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vextractf64x2 $1, %ymm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vextractf64x2 $1, %ymm0, %xmm0
-- 
2.34.1
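
Why kmovw alone is not safe here: under AVX-512BW a mask register (k0-k7) holds up to
64 bits (one bit per byte lane of a 512-bit vector), while kmovw copies only the low
16 bits. The sketch below is a minimal C++ model of that width mismatch; it is
illustrative only, the helper names are invented, and it is not LLVM or intrinsics code.

#include <cassert>
#include <cstdint>

// Models kmovw: copies only the low 16 bits of a mask register.
static uint64_t kmovw_copy(uint64_t Src) { return Src & 0xFFFFu; }

// Models kmovq (requires AVX-512BW): copies the full 64-bit mask.
static uint64_t kmovq_copy(uint64_t Src) { return Src; }

int main() {
  // A 64-bit mask such as one produced by a <64 x i8> compare under BW.
  const uint64_t K1 = 0x00FF00FF00FF00FFULL;

  assert(kmovq_copy(K1) == K1);      // full mask preserved
  assert(kmovw_copy(K1) == 0x00FFu); // upper 48 bits dropped (the bug fixed above)
  return 0;
}

This width mismatch is why copyPhysRegOpcode_AVX512_BW selects KMOVQkk for every
mask-to-mask copy once BW is available, and why the test expectations change from
kmovw to kmovq.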