[PowerPC] Include the permutation cost for unaligned vector loads

author Hal Finkel <hfinkel@anl.gov>

Thu, 3 Sep 2015 21:23:18 +0000 (21:23 +0000)

committer Hal Finkel <hfinkel@anl.gov>

Thu, 3 Sep 2015 21:23:18 +0000 (21:23 +0000)
author Hal Finkel <hfinkel@anl.gov>
Thu, 3 Sep 2015 21:23:18 +0000 (21:23 +0000)
committer Hal Finkel <hfinkel@anl.gov>
Thu, 3 Sep 2015 21:23:18 +0000 (21:23 +0000)
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp

index 47469da3f6ccea24f5527f66d1a0b2044df1bb47..937d9c6810d7612ae865da36174cf5b4eda6085d 100644 (file)
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -333,6 +333,18 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
    bool IsQPXType = ST->hasQPX() &&
                     (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);
  
+  // If we can use the permutation-based load sequence, then this is also
+  // relatively cheap (not counting loop-invariant instructions): one load plus
+  // one permute (the last load in a series has extra cost, but we're
+  // neglecting that here). Note that on the P7, we could do unaligned loads
+  // for Altivec types using the VSX instructions, but that's more expensive
+  // than using the permutation-based load sequence. On the P8, that's no
+  // longer true.
+  if (Opcode == Instruction::Load &&
+      ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
+      Alignment >= LT.second.getScalarType().getStoreSize())
+    return Cost + LT.first; // Add the cost of the permutations.
+
    // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
    // P7, unaligned vector loads are more expensive than the permutation-based
    // load sequence, so that might be used instead, but regardless, the net cost
@@ -340,14 +352,6 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
    if (IsVSXType || (ST->hasVSX() && IsAltivecType))
      return Cost;
  
-  // If we can use the permutation-based load sequence, then this is also
-  // relatively cheap (not counting loop-invariant instructions).
-  bool PermutationLoad = Opcode == Instruction::Load &&
-                         (IsAltivecType || IsQPXType) &&
-                         Alignment >= LT.second.getScalarType().getStoreSize();
-  if (PermutationLoad)
-    return Cost;
-
    // PPC in general does not support unaligned loads and stores. They'll need
    // to be decomposed based on the alignment factor.
  
diff --git a/test/Analysis/CostModel/PowerPC/load_store.ll b/test/Analysis/CostModel/PowerPC/load_store.ll

index 9501deedeaa9efe6c51820ea27d99e3c7df74ebc..0a568b88e7269c6bfa6040d222544ca6f4f81ebd 100644 (file)
--- a/test/Analysis/CostModel/PowerPC/load_store.ll
+++ b/test/Analysis/CostModel/PowerPC/load_store.ll
@@ -34,7 +34,7 @@ define i32 @loads(i32 %arg) {
    ; CHECK: cost of 48 {{.*}} load
    load <4 x i16>, <4 x i16>* undef, align 2
  
-  ; CHECK: cost of 1 {{.*}} load
+  ; CHECK: cost of 2 {{.*}} load
    load <4 x i32>, <4 x i32>* undef, align 4
  
    ; CHECK: cost of 46 {{.*}} load
diff --git a/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll b/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll

index b983d84ce729ad9afeec42dfcf72b9a318791c4a..3b1bc3b3fdbc098025278eb666361adf445e762c 100644 (file)
--- a/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll
+++ b/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll
@@ -8,7 +8,7 @@ entry:
    ret <16 x i8> %r
  
  ; CHECK-LABEL: test_l_v16i8
-; CHECK: cost of 1 for instruction:   %r = load <16 x i8>, <16 x i8>* %p, align 1
+; CHECK: cost of 2 for instruction:   %r = load <16 x i8>, <16 x i8>* %p, align 1
  }
  
  define <32 x i8> @test_l_v32i8(<32 x i8>* %p) #0 {
@@ -17,7 +17,7 @@ entry:
    ret <32 x i8> %r
  
  ; CHECK-LABEL: test_l_v32i8
-; CHECK: cost of 2 for instruction:   %r = load <32 x i8>, <32 x i8>* %p, align 1
+; CHECK: cost of 4 for instruction:   %r = load <32 x i8>, <32 x i8>* %p, align 1
  }
  
  define <8 x i16> @test_l_v8i16(<8 x i16>* %p) #0 {
@@ -26,7 +26,7 @@ entry:
    ret <8 x i16> %r
  
  ; CHECK-LABEL: test_l_v8i16
-; CHECK: cost of 1 for instruction:   %r = load <8 x i16>, <8 x i16>* %p, align 2
+; CHECK: cost of 2 for instruction:   %r = load <8 x i16>, <8 x i16>* %p, align 2
  }
  
  define <16 x i16> @test_l_v16i16(<16 x i16>* %p) #0 {
@@ -35,7 +35,7 @@ entry:
    ret <16 x i16> %r
  
  ; CHECK-LABEL: test_l_v16i16
-; CHECK: cost of 2 for instruction:   %r = load <16 x i16>, <16 x i16>* %p, align 2
+; CHECK: cost of 4 for instruction:   %r = load <16 x i16>, <16 x i16>* %p, align 2
  }
  
  define <4 x i32> @test_l_v4i32(<4 x i32>* %p) #0 {
@@ -44,7 +44,7 @@ entry:
    ret <4 x i32> %r
  
  ; CHECK-LABEL: test_l_v4i32
-; CHECK: cost of 1 for instruction:   %r = load <4 x i32>, <4 x i32>* %p, align 4
+; CHECK: cost of 2 for instruction:   %r = load <4 x i32>, <4 x i32>* %p, align 4
  }
  
  define <8 x i32> @test_l_v8i32(<8 x i32>* %p) #0 {
@@ -53,7 +53,7 @@ entry:
    ret <8 x i32> %r
  
  ; CHECK-LABEL: test_l_v8i32
-; CHECK: cost of 2 for instruction:   %r = load <8 x i32>, <8 x i32>* %p, align 4
+; CHECK: cost of 4 for instruction:   %r = load <8 x i32>, <8 x i32>* %p, align 4
  }
  
  define <2 x i64> @test_l_v2i64(<2 x i64>* %p) #0 {
@@ -80,7 +80,7 @@ entry:
    ret <4 x float> %r
  
  ; CHECK-LABEL: test_l_v4float
-; CHECK: cost of 1 for instruction:   %r = load <4 x float>, <4 x float>* %p, align 4
+; CHECK: cost of 2 for instruction:   %r = load <4 x float>, <4 x float>* %p, align 4
  }
  
  define <8 x float> @test_l_v8float(<8 x float>* %p) #0 {
@@ -89,7 +89,7 @@ entry:
    ret <8 x float> %r
  
  ; CHECK-LABEL: test_l_v8float
-; CHECK: cost of 2 for instruction:   %r = load <8 x float>, <8 x float>* %p, align 4
+; CHECK: cost of 4 for instruction:   %r = load <8 x float>, <8 x float>* %p, align 4
  }
  
  define <2 x double> @test_l_v2double(<2 x double>* %p) #0 {
@@ -224,7 +224,7 @@ entry:
    ret <4 x float> %r
  
  ; CHECK-LABEL: test_l_qv4float
-; CHECK: cost of 1 for instruction:   %r = load <4 x float>, <4 x float>* %p, align 4
+; CHECK: cost of 2 for instruction:   %r = load <4 x float>, <4 x float>* %p, align 4
  }
  
  define <8 x float> @test_l_qv8float(<8 x float>* %p) #1 {
@@ -233,7 +233,7 @@ entry:
    ret <8 x float> %r
  
  ; CHECK-LABEL: test_l_qv8float
-; CHECK: cost of 2 for instruction:   %r = load <8 x float>, <8 x float>* %p, align 4
+; CHECK: cost of 4 for instruction:   %r = load <8 x float>, <8 x float>* %p, align 4
  }
  
  define <4 x double> @test_l_qv4double(<4 x double>* %p) #1 {
@@ -242,7 +242,7 @@ entry:
    ret <4 x double> %r
  
  ; CHECK-LABEL: test_l_qv4double
-; CHECK: cost of 1 for instruction:   %r = load <4 x double>, <4 x double>* %p, align 8
+; CHECK: cost of 2 for instruction:   %r = load <4 x double>, <4 x double>* %p, align 8
  }
  
  define <8 x double> @test_l_qv8double(<8 x double>* %p) #1 {
@@ -251,7 +251,7 @@ entry:
    ret <8 x double> %r
  
  ; CHECK-LABEL: test_l_qv8double
-; CHECK: cost of 2 for instruction:   %r = load <8 x double>, <8 x double>* %p, align 8
+; CHECK: cost of 4 for instruction:   %r = load <8 x double>, <8 x double>* %p, align 8
  }
  
  define void @test_s_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 {
author	Hal Finkel <hfinkel@anl.gov>
	Thu, 3 Sep 2015 21:23:18 +0000 (21:23 +0000)
committer	Hal Finkel <hfinkel@anl.gov>
	Thu, 3 Sep 2015 21:23:18 +0000 (21:23 +0000)
lib/Target/PowerPC/PPCTargetTransformInfo.cpp		patch \| blob \| history
test/Analysis/CostModel/PowerPC/load_store.ll		patch \| blob \| history
test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll		patch \| blob \| history