Properly qualify AVX2 specific parts of execution dependency table. Also enable converting between 256-bit PS/PD operations when only AVX (without AVX2) is enabled.

author Craig Topper <craig.topper@gmail.com>

Tue, 15 Nov 2011 05:55:35 +0000 (05:55 +0000)

committer Craig Topper <craig.topper@gmail.com>

Tue, 15 Nov 2011 05:55:35 +0000 (05:55 +0000)
author Craig Topper <craig.topper@gmail.com>
Tue, 15 Nov 2011 05:55:35 +0000 (05:55 +0000)
committer Craig Topper <craig.topper@gmail.com>
Tue, 15 Nov 2011 05:55:35 +0000 (05:55 +0000)
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp

index 9428fffae884c4d6659010e36911c30453fe4bfc..9f7b21f652917ba7f11442b7d58380745127e87d 100644 (file)
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -3555,7 +3555,11 @@ static const unsigned ReplaceableInstrs[][3] = {
    { X86::VMOVAPSYrr,   X86::VMOVAPDYrr,   X86::VMOVDQAYrr  },
    { X86::VMOVUPSYmr,   X86::VMOVUPDYmr,   X86::VMOVDQUYmr  },
    { X86::VMOVUPSYrm,   X86::VMOVUPDYrm,   X86::VMOVDQUYrm  },
-  { X86::VMOVNTPSYmr,  X86::VMOVNTPDYmr,  X86::VMOVNTDQYmr },
+  { X86::VMOVNTPSYmr,  X86::VMOVNTPDYmr,  X86::VMOVNTDQYmr }
+};
+
+static const unsigned ReplaceableInstrsAVX2[][3] = {
+  //PackedSingle       PackedDouble       PackedInt
    { X86::VANDNPSYrm,   X86::VANDNPDYrm,   X86::VPANDNYrm   },
    { X86::VANDNPSYrr,   X86::VANDNPDYrr,   X86::VPANDNYrr   },
    { X86::VANDPSYrm,    X86::VANDPDYrm,    X86::VPANDYrm    },
@@ -3563,31 +3567,37 @@ static const unsigned ReplaceableInstrs[][3] = {
    { X86::VORPSYrm,     X86::VORPDYrm,     X86::VPORYrm     },
    { X86::VORPSYrr,     X86::VORPDYrr,     X86::VPORYrr     },
    { X86::VXORPSYrm,    X86::VXORPDYrm,    X86::VPXORYrm    },
-  { X86::VXORPSYrr,    X86::VXORPDYrr,    X86::VPXORYrr    },
+  { X86::VXORPSYrr,    X86::VXORPDYrr,    X86::VPXORYrr    }
  };
  
  // FIXME: Some shuffle and unpack instructions have equivalents in different
  // domains, but they require a bit more work than just switching opcodes.
  
-static const unsigned *lookup(unsigned opcode, unsigned domain) {
+static const unsigned *lookup(unsigned opcode, unsigned domain, bool hasAVX2) {
    for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i)
      if (ReplaceableInstrs[i][domain-1] == opcode)
        return ReplaceableInstrs[i];
+  if (domain != 3 || hasAVX2) // only use PackedInt domain if AVX2 is enabled
+    for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i)
+      if (ReplaceableInstrsAVX2[i][domain-1] == opcode)
+        return ReplaceableInstrsAVX2[i];
    return 0;
  }
  
  std::pair<uint16_t, uint16_t>
  X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const {
    uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+  bool hasAVX2 = TM.getSubtarget<X86Subtarget>().hasAVX2();
    return std::make_pair(domain,
-                        domain && lookup(MI->getOpcode(), domain) ? 0xe : 0);
+                  domain && lookup(MI->getOpcode(), domain, hasAVX2) ? 0xe : 0);
  }
  
  void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
    assert(Domain>0 && Domain<4 && "Invalid execution domain");
    uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
    assert(dom && "Not an SSE instruction");
-  const unsigned *table = lookup(MI->getOpcode(), dom);
+  bool hasAVX2 = TM.getSubtarget<X86Subtarget>().hasAVX2();
+  const unsigned *table = lookup(MI->getOpcode(), dom, hasAVX2);
    assert(table && "Cannot change domain");
    MI->setDesc(get(table[Domain-1]));
  }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp

index 4d4d7c06ab9abe38fda1668284f2742f8e158a6d..feb71555c4661766edeb47283cb0ceabf6e9ab66 100644 (file)
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -140,10 +140,7 @@ bool X86TargetMachine::addPreEmitPass(PassManagerBase &PM,
        PM.add(createExecutionDependencyFixPass(&X86::VR128RegClass));
        ShouldPrint = true;
      }
-    if (Subtarget.hasAVX2()) {
-      // FIXME this should be turned on for just AVX, but the pass doesn't check
-      // that instructions are valid before replacing them and there are AVX2
-      // integer instructions in the table.
+    if (Subtarget.hasAVX()) {
        PM.add(createExecutionDependencyFixPass(&X86::VR256RegClass));
        ShouldPrint = true;
      }
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll

index 3fa1d95bf2f88721cb81d03e472dd4a8583d4c6e..df12b71dba5d8df1bf214a6daba379ed4bfdde6e 100644 (file)
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -2021,7 +2021,9 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readonly
  
  define <32 x i8> @test_x86_avx_loadu_dq_256(i8* %a0) {
    ; CHECK: vmovdqu
-  %res = call <32 x i8> @llvm.x86.avx.loadu.dq.256(i8* %a0) ; <<32 x i8>> [#uses=1]
+  %a1 = call <32 x i8> @llvm.x86.avx.loadu.dq.256(i8* %a0) ; <<32 x i8>> [#uses=1]
+  ; add operation forces the execution domain.
+  %res = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
    ret <32 x i8> %res
  }
  declare <32 x i8> @llvm.x86.avx.loadu.dq.256(i8*) nounwind readonly
@@ -2029,7 +2031,9 @@ declare <32 x i8> @llvm.x86.avx.loadu.dq.256(i8*) nounwind readonly
  
  define <4 x double> @test_x86_avx_loadu_pd_256(i8* %a0) {
    ; CHECK: vmovupd
-  %res = call <4 x double> @llvm.x86.avx.loadu.pd.256(i8* %a0) ; <<4 x double>> [#uses=1]
+  %a1 = call <4 x double> @llvm.x86.avx.loadu.pd.256(i8* %a0) ; <<4 x double>> [#uses=1]
+  ; add operation forces the execution domain.
+  %res = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
    ret <4 x double> %res
  }
  declare <4 x double> @llvm.x86.avx.loadu.pd.256(i8*) nounwind readonly
@@ -2157,7 +2161,9 @@ declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
  
  define void @test_x86_avx_movnt_dq_256(i8* %a0, <4 x i64> %a1) {
    ; CHECK: vmovntdq
-  call void @llvm.x86.avx.movnt.dq.256(i8* %a0, <4 x i64> %a1)
+  ; add operation forces the execution domain.
+  %a2 = add <4 x i64> %a1, <i64 1, i64 1, i64 1, i64 1>
+  call void @llvm.x86.avx.movnt.dq.256(i8* %a0, <4 x i64> %a2)
    ret void
  }
  declare void @llvm.x86.avx.movnt.dq.256(i8*, <4 x i64>) nounwind
@@ -2165,7 +2171,8 @@ declare void @llvm.x86.avx.movnt.dq.256(i8*, <4 x i64>) nounwind
  
  define void @test_x86_avx_movnt_pd_256(i8* %a0, <4 x double> %a1) {
    ; CHECK: vmovntpd
-  call void @llvm.x86.avx.movnt.pd.256(i8* %a0, <4 x double> %a1)
+  %a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
+  call void @llvm.x86.avx.movnt.pd.256(i8* %a0, <4 x double> %a2)
    ret void
  }
  declare void @llvm.x86.avx.movnt.pd.256(i8*, <4 x double>) nounwind
@@ -2258,7 +2265,9 @@ declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
  
  define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
    ; CHECK: vmovdqu
-  call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a1)
+  ; add operation forces the execution domain.
+  %a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
    ret void
  }
  declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
@@ -2266,7 +2275,9 @@ declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
  
  define void @test_x86_avx_storeu_pd_256(i8* %a0, <4 x double> %a1) {
    ; CHECK: vmovupd
-  call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a1)
+  ; add operation forces the execution domain.
+  %a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
+  call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a2)
    ret void
  }
  declare void @llvm.x86.avx.storeu.pd.256(i8*, <4 x double>) nounwind
diff --git a/test/CodeGen/X86/avx-logic.ll b/test/CodeGen/X86/avx-logic.ll

index cd371355828de9c33dbe9c5c2e9fea52d4938b1f..115cefb1b5ebe2b5d2968bd169889807d92a165b 100644 (file)
--- a/test/CodeGen/X86/avx-logic.ll
+++ b/test/CodeGen/X86/avx-logic.ll
@@ -7,7 +7,9 @@ entry:
    %1 = bitcast <4 x double> %y to <4 x i64>
    %and.i = and <4 x i64> %0, %1
    %2 = bitcast <4 x i64> %and.i to <4 x double>
-  ret <4 x double> %2
+  ; add forces execution domain
+  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <4 x double> %3
  }
  
  ; CHECK: vandpd LCP{{.*}}(%rip)
@@ -16,7 +18,9 @@ entry:
    %0 = bitcast <4 x double> %y to <4 x i64>
    %and.i = and <4 x i64> %0, <i64 4616752568008179712, i64 4614838538166547251, i64 4612361558371493478, i64 4608083138725491507>
    %1 = bitcast <4 x i64> %and.i to <4 x double>
-  ret <4 x double> %1
+  ; add forces execution domain
+  %2 = fadd <4 x double> %1, <double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <4 x double> %2
  }
  
  ; CHECK: vandps
@@ -45,7 +49,9 @@ entry:
    %1 = bitcast <4 x double> %y to <4 x i64>
    %xor.i = xor <4 x i64> %0, %1
    %2 = bitcast <4 x i64> %xor.i to <4 x double>
-  ret <4 x double> %2
+  ; add forces execution domain
+  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <4 x double> %3
  }
  
  ; CHECK: vxorpd LCP{{.*}}(%rip)
@@ -54,7 +60,9 @@ entry:
    %0 = bitcast <4 x double> %y to <4 x i64>
    %xor.i = xor <4 x i64> %0, <i64 4616752568008179712, i64 4614838538166547251, i64 4612361558371493478, i64 4608083138725491507>
    %1 = bitcast <4 x i64> %xor.i to <4 x double>
-  ret <4 x double> %1
+  ; add forces execution domain
+  %2 = fadd <4 x double> %1, <double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <4 x double> %2
  }
  
  ; CHECK: vxorps
@@ -83,7 +91,9 @@ entry:
    %1 = bitcast <4 x double> %y to <4 x i64>
    %or.i = or <4 x i64> %0, %1
    %2 = bitcast <4 x i64> %or.i to <4 x double>
-  ret <4 x double> %2
+  ; add forces execution domain
+  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <4 x double> %3
  }
  
  ; CHECK: vorpd LCP{{.*}}(%rip)
@@ -92,7 +102,9 @@ entry:
    %0 = bitcast <4 x double> %y to <4 x i64>
    %or.i = or <4 x i64> %0, <i64 4616752568008179712, i64 4614838538166547251, i64 4612361558371493478, i64 4608083138725491507>
    %1 = bitcast <4 x i64> %or.i to <4 x double>
-  ret <4 x double> %1
+  ; add forces execution domain
+  %2 = fadd <4 x double> %1, <double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <4 x double> %2
  }
  
  ; CHECK: vorps
@@ -122,7 +134,9 @@ entry:
    %1 = bitcast <4 x double> %y to <4 x i64>
    %and.i = and <4 x i64> %1, %neg.i
    %2 = bitcast <4 x i64> %and.i to <4 x double>
-  ret <4 x double> %2
+  ; add forces execution domain
+  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <4 x double> %3
  }
  
  ; CHECK: vandnpd (%
@@ -134,7 +148,9 @@ entry:
    %1 = bitcast <4 x double> %tmp2 to <4 x i64>
    %and.i = and <4 x i64> %1, %neg.i
    %2 = bitcast <4 x i64> %and.i to <4 x double>
-  ret <4 x double> %2
+  ; add forces execution domain
+  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <4 x double> %3
  }
  
  ; CHECK: vandnps
author	Craig Topper <craig.topper@gmail.com>
	Tue, 15 Nov 2011 05:55:35 +0000 (05:55 +0000)
committer	Craig Topper <craig.topper@gmail.com>
	Tue, 15 Nov 2011 05:55:35 +0000 (05:55 +0000)
lib/Target/X86/X86InstrInfo.cpp		patch | blob | history
lib/Target/X86/X86TargetMachine.cpp		patch | blob | history
test/CodeGen/X86/avx-intrinsics-x86.ll		patch | blob | history
test/CodeGen/X86/avx-logic.ll		patch | blob | history