From 4c077a1f04c97210793d62debef250b974d168bc Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 15 Nov 2011 05:55:35 +0000
Subject: [PATCH] Properly qualify AVX2 specific parts of execution dependency
 table. Also enable converting between 256-bit PS/PD operations when AVX1 is
 enabled. Fixes PR11370.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@144622 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrInfo.cpp        | 20 ++++++++++++----
 lib/Target/X86/X86TargetMachine.cpp    |  5 +---
 test/CodeGen/X86/avx-intrinsics-x86.ll | 23 +++++++++++++-----
 test/CodeGen/X86/avx-logic.ll          | 32 +++++++++++++++++++-------
 4 files changed, 57 insertions(+), 23 deletions(-)

diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 9428fffae88..9f7b21f6529 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -3555,7 +3555,11 @@ static const unsigned ReplaceableInstrs[][3] = {
   { X86::VMOVAPSYrr,   X86::VMOVAPDYrr,  X86::VMOVDQAYrr },
   { X86::VMOVUPSYmr,   X86::VMOVUPDYmr,  X86::VMOVDQUYmr },
   { X86::VMOVUPSYrm,   X86::VMOVUPDYrm,  X86::VMOVDQUYrm },
-  { X86::VMOVNTPSYmr,  X86::VMOVNTPDYmr, X86::VMOVNTDQYmr },
+  { X86::VMOVNTPSYmr,  X86::VMOVNTPDYmr, X86::VMOVNTDQYmr }
+};
+
+static const unsigned ReplaceableInstrsAVX2[][3] = {
+  //PackedSingle       PackedDouble      PackedInt
   { X86::VANDNPSYrm,   X86::VANDNPDYrm,  X86::VPANDNYrm },
   { X86::VANDNPSYrr,   X86::VANDNPDYrr,  X86::VPANDNYrr },
   { X86::VANDPSYrm,    X86::VANDPDYrm,   X86::VPANDYrm },
@@ -3563,31 +3567,37 @@ static const unsigned ReplaceableInstrs[][3] = {
   { X86::VORPSYrm,     X86::VORPDYrm,    X86::VPORYrm },
   { X86::VORPSYrr,     X86::VORPDYrr,    X86::VPORYrr },
   { X86::VXORPSYrm,    X86::VXORPDYrm,   X86::VPXORYrm },
-  { X86::VXORPSYrr,    X86::VXORPDYrr,   X86::VPXORYrr },
+  { X86::VXORPSYrr,    X86::VXORPDYrr,   X86::VPXORYrr }
 };
 
 // FIXME: Some shuffle and unpack instructions have equivalents in different
 // domains, but they require a bit more work than just switching opcodes.
 
-static const unsigned *lookup(unsigned opcode, unsigned domain) {
+static const unsigned *lookup(unsigned opcode, unsigned domain, bool hasAVX2) {
   for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i)
     if (ReplaceableInstrs[i][domain-1] == opcode)
       return ReplaceableInstrs[i];
+  if (domain != 3 || hasAVX2) // only use PackedInt domain if AVX2 is enabled
+    for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i)
+      if (ReplaceableInstrsAVX2[i][domain-1] == opcode)
+        return ReplaceableInstrsAVX2[i];
   return 0;
 }
 
 std::pair<uint16_t, uint16_t>
 X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const {
   uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+  bool hasAVX2 = TM.getSubtarget<X86Subtarget>().hasAVX2();
   return std::make_pair(domain,
-                        domain && lookup(MI->getOpcode(), domain) ? 0xe : 0);
+                        domain && lookup(MI->getOpcode(), domain, hasAVX2) ? 0xe : 0);
 }
 
 void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
   assert(Domain>0 && Domain<4 && "Invalid execution domain");
   uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
   assert(dom && "Not an SSE instruction");
-  const unsigned *table = lookup(MI->getOpcode(), dom);
+  bool hasAVX2 = TM.getSubtarget<X86Subtarget>().hasAVX2();
+  const unsigned *table = lookup(MI->getOpcode(), dom, hasAVX2);
   assert(table && "Cannot change domain");
   MI->setDesc(get(table[Domain-1]));
 }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 4d4d7c06ab9..feb71555c46 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -140,10 +140,7 @@ bool X86TargetMachine::addPreEmitPass(PassManagerBase &PM,
     PM.add(createExecutionDependencyFixPass(&X86::VR128RegClass));
     ShouldPrint = true;
   }
-  if (Subtarget.hasAVX2()) {
-    // FIXME this should be turned on for just AVX, but the pass doesn't check
-    // that instructions are valid before replacing them and there are AVX2
-    // integer instructions in the table.
+  if (Subtarget.hasAVX()) {
     PM.add(createExecutionDependencyFixPass(&X86::VR256RegClass));
     ShouldPrint = true;
   }
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index 3fa1d95bf2f..df12b71dba5 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -2021,7 +2021,9 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readonly
 
 define <32 x i8> @test_x86_avx_loadu_dq_256(i8* %a0) {
   ; CHECK: vmovdqu
-  %res = call <32 x i8> @llvm.x86.avx.loadu.dq.256(i8* %a0) ; <<32 x i8>> [#uses=1]
+  %a1 = call <32 x i8> @llvm.x86.avx.loadu.dq.256(i8* %a0) ; <<32 x i8>> [#uses=1]
+  ; add operation forces the execution domain.
+  %res = add <32 x i8> %a1,
   ret <32 x i8> %res
 }
 declare <32 x i8> @llvm.x86.avx.loadu.dq.256(i8*) nounwind readonly
@@ -2029,7 +2031,9 @@ declare <32 x i8> @llvm.x86.avx.loadu.dq.256(i8*) nounwind readonly
 
 define <4 x double> @test_x86_avx_loadu_pd_256(i8* %a0) {
   ; CHECK: vmovupd
-  %res = call <4 x double> @llvm.x86.avx.loadu.pd.256(i8* %a0) ; <<4 x double>> [#uses=1]
+  %a1 = call <4 x double> @llvm.x86.avx.loadu.pd.256(i8* %a0) ; <<4 x double>> [#uses=1]
+  ; add operation forces the execution domain.
+  %res = fadd <4 x double> %a1,
   ret <4 x double> %res
 }
 declare <4 x double> @llvm.x86.avx.loadu.pd.256(i8*) nounwind readonly
@@ -2157,7 +2161,9 @@ declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
 
 define void @test_x86_avx_movnt_dq_256(i8* %a0, <4 x i64> %a1) {
   ; CHECK: vmovntdq
-  call void @llvm.x86.avx.movnt.dq.256(i8* %a0, <4 x i64> %a1)
+  ; add operation forces the execution domain.
+  %a2 = add <4 x i64> %a1,
+  call void @llvm.x86.avx.movnt.dq.256(i8* %a0, <4 x i64> %a2)
   ret void
 }
 declare void @llvm.x86.avx.movnt.dq.256(i8*, <4 x i64>) nounwind
@@ -2165,7 +2171,8 @@ declare void @llvm.x86.avx.movnt.dq.256(i8*, <4 x i64>) nounwind
 
 define void @test_x86_avx_movnt_pd_256(i8* %a0, <4 x double> %a1) {
   ; CHECK: vmovntpd
-  call void @llvm.x86.avx.movnt.pd.256(i8* %a0, <4 x double> %a1)
+  %a2 = fadd <4 x double> %a1,
+  call void @llvm.x86.avx.movnt.pd.256(i8* %a0, <4 x double> %a2)
   ret void
 }
 declare void @llvm.x86.avx.movnt.pd.256(i8*, <4 x double>) nounwind
@@ -2258,7 +2265,9 @@ declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
 
 define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
   ; CHECK: vmovdqu
-  call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a1)
+  ; add operation forces the execution domain.
+  %a2 = add <32 x i8> %a1,
+  call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
   ret void
 }
 declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
@@ -2266,7 +2275,9 @@ declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
 
 define void @test_x86_avx_storeu_pd_256(i8* %a0, <4 x double> %a1) {
   ; CHECK: vmovupd
-  call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a1)
+  ; add operation forces the execution domain.
+  %a2 = fadd <4 x double> %a1,
+  call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a2)
   ret void
 }
 declare void @llvm.x86.avx.storeu.pd.256(i8*, <4 x double>) nounwind
diff --git a/test/CodeGen/X86/avx-logic.ll b/test/CodeGen/X86/avx-logic.ll
index cd371355828..115cefb1b5e 100644
--- a/test/CodeGen/X86/avx-logic.ll
+++ b/test/CodeGen/X86/avx-logic.ll
@@ -7,7 +7,9 @@ entry:
   %1 = bitcast <4 x double> %y to <4 x i64>
   %and.i = and <4 x i64> %0, %1
   %2 = bitcast <4 x i64> %and.i to <4 x double>
-  ret <4 x double> %2
+  ; add forces execution domain
+  %3 = fadd <4 x double> %2,
+  ret <4 x double> %3
 }
 
 ; CHECK: vandpd LCP{{.*}}(%rip)
@@ -16,7 +18,9 @@ entry:
   %0 = bitcast <4 x double> %y to <4 x i64>
   %and.i = and <4 x i64> %0,
   %1 = bitcast <4 x i64> %and.i to <4 x double>
-  ret <4 x double> %1
+  ; add forces execution domain
+  %2 = fadd <4 x double> %1,
+  ret <4 x double> %2
 }
 
 ; CHECK: vandps
@@ -45,7 +49,9 @@ entry:
   %1 = bitcast <4 x double> %y to <4 x i64>
   %xor.i = xor <4 x i64> %0, %1
   %2 = bitcast <4 x i64> %xor.i to <4 x double>
-  ret <4 x double> %2
+  ; add forces execution domain
+  %3 = fadd <4 x double> %2,
+  ret <4 x double> %3
 }
 
 ; CHECK: vxorpd LCP{{.*}}(%rip)
@@ -54,7 +60,9 @@ entry:
   %0 = bitcast <4 x double> %y to <4 x i64>
   %xor.i = xor <4 x i64> %0,
   %1 = bitcast <4 x i64> %xor.i to <4 x double>
-  ret <4 x double> %1
+  ; add forces execution domain
+  %2 = fadd <4 x double> %1,
+  ret <4 x double> %2
 }
 
 ; CHECK: vxorps
@@ -83,7 +91,9 @@ entry:
   %1 = bitcast <4 x double> %y to <4 x i64>
   %or.i = or <4 x i64> %0, %1
   %2 = bitcast <4 x i64> %or.i to <4 x double>
-  ret <4 x double> %2
+  ; add forces execution domain
+  %3 = fadd <4 x double> %2,
+  ret <4 x double> %3
 }
 
 ; CHECK: vorpd LCP{{.*}}(%rip)
@@ -92,7 +102,9 @@ entry:
   %0 = bitcast <4 x double> %y to <4 x i64>
   %or.i = or <4 x i64> %0,
   %1 = bitcast <4 x i64> %or.i to <4 x double>
-  ret <4 x double> %1
+  ; add forces execution domain
+  %2 = fadd <4 x double> %1,
+  ret <4 x double> %2
 }
 
 ; CHECK: vorps
@@ -122,7 +134,9 @@ entry:
   %1 = bitcast <4 x double> %y to <4 x i64>
   %and.i = and <4 x i64> %1, %neg.i
   %2 = bitcast <4 x i64> %and.i to <4 x double>
-  ret <4 x double> %2
+  ; add forces execution domain
+  %3 = fadd <4 x double> %2,
+  ret <4 x double> %3
 }
 
 ; CHECK: vandnpd (%
@@ -134,7 +148,9 @@ entry:
   %1 = bitcast <4 x double> %tmp2 to <4 x i64>
   %and.i = and <4 x i64> %1, %neg.i
   %2 = bitcast <4 x i64> %and.i to <4 x double>
-  ret <4 x double> %2
+  ; add forces execution domain
+  %3 = fadd <4 x double> %2,
+  ret <4 x double> %3
 }
 
 ; CHECK: vandnps
-- 
2.34.1