From ecebcfc3a1e9e115f04c2610779d6020f8e1d03b Mon Sep 17 00:00:00 2001
From: Hal Finkel <hfinkel@anl.gov>
Date: Thu, 3 Sep 2015 21:23:18 +0000
Subject: [PATCH] [PowerPC] Include the permutation cost for unaligned vector
 loads

Pre-P8, when we generate code for unaligned vector loads (for Altivec and QPX
types), even when accounting for the combining that takes place for multiple
consecutive such loads, there is at least one load instruction and one
permutation for each load. Make sure the cost reported reflects the cost of the
permutes as well.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@246807 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 20 +++++++++-------
 test/Analysis/CostModel/PowerPC/load_store.ll |  2 +-
 .../CostModel/PowerPC/unal-vec-ldst.ll        | 24 +++++++++----------
 3 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 47469da3f6c..937d9c6810d 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -333,6 +333,18 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
   bool IsQPXType = ST->hasQPX() &&
                    (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);
 
+  // If we can use the permutation-based load sequence, then this is also
+  // relatively cheap (not counting loop-invariant instructions): one load plus
+  // one permute (the last load in a series has extra cost, but we're
+  // neglecting that here). Note that on the P7, we should do unaligned loads
+  // for Altivec types using the VSX instructions, but that's more expensive
+  // than using the permutation-based load sequence. On the P8, that's no
+  // longer true.
+  if (Opcode == Instruction::Load &&
+      ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
+      Alignment >= LT.second.getScalarType().getStoreSize())
+    return Cost + LT.first; // Add the cost of the permutations.
+
   // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
   // P7, unaligned vector loads are more expensive than the permutation-based
   // load sequence, so that might be used instead, but regardless, the net cost
@@ -340,14 +352,6 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
   if (IsVSXType || (ST->hasVSX() && IsAltivecType))
     return Cost;
 
-  // If we can use the permutation-based load sequence, then this is also
-  // relatively cheap (not counting loop-invariant instructions).
-  bool PermutationLoad = Opcode == Instruction::Load &&
-                         (IsAltivecType || IsQPXType) &&
-                         Alignment >= LT.second.getScalarType().getStoreSize();
-  if (PermutationLoad)
-    return Cost;
-
   // PPC in general does not support unaligned loads and stores. They'll need
   // to be decomposed based on the alignment factor.
 
diff --git a/test/Analysis/CostModel/PowerPC/load_store.ll b/test/Analysis/CostModel/PowerPC/load_store.ll
index 9501deedeaa..0a568b88e72 100644
--- a/test/Analysis/CostModel/PowerPC/load_store.ll
+++ b/test/Analysis/CostModel/PowerPC/load_store.ll
@@ -34,7 +34,7 @@ define i32 @loads(i32 %arg) {
   ; CHECK: cost of 48 {{.*}} load
   load <4 x i16>, <4 x i16>* undef, align 2
 
-  ; CHECK: cost of 1 {{.*}} load
+  ; CHECK: cost of 2 {{.*}} load
   load <4 x i32>, <4 x i32>* undef, align 4
 
   ; CHECK: cost of 46 {{.*}} load
diff --git a/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll b/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll
index b983d84ce72..3b1bc3b3fdb 100644
--- a/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll
+++ b/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll
@@ -8,7 +8,7 @@ entry:
   ret <16 x i8> %r
 
 ; CHECK-LABEL: test_l_v16i8
-; CHECK: cost of 1 for instruction:   %r = load <16 x i8>, <16 x i8>* %p, align 1
+; CHECK: cost of 2 for instruction:   %r = load <16 x i8>, <16 x i8>* %p, align 1
 }
 
 define <32 x i8> @test_l_v32i8(<32 x i8>* %p) #0 {
@@ -17,7 +17,7 @@ entry:
   ret <32 x i8> %r
 
 ; CHECK-LABEL: test_l_v32i8
-; CHECK: cost of 2 for instruction:   %r = load <32 x i8>, <32 x i8>* %p, align 1
+; CHECK: cost of 4 for instruction:   %r = load <32 x i8>, <32 x i8>* %p, align 1
 }
 
 define <8 x i16> @test_l_v8i16(<8 x i16>* %p) #0 {
@@ -26,7 +26,7 @@ entry:
   ret <8 x i16> %r
 
 ; CHECK-LABEL: test_l_v8i16
-; CHECK: cost of 1 for instruction:   %r = load <8 x i16>, <8 x i16>* %p, align 2
+; CHECK: cost of 2 for instruction:   %r = load <8 x i16>, <8 x i16>* %p, align 2
 }
 
 define <16 x i16> @test_l_v16i16(<16 x i16>* %p) #0 {
@@ -35,7 +35,7 @@ entry:
   ret <16 x i16> %r
 
 ; CHECK-LABEL: test_l_v16i16
-; CHECK: cost of 2 for instruction:   %r = load <16 x i16>, <16 x i16>* %p, align 2
+; CHECK: cost of 4 for instruction:   %r = load <16 x i16>, <16 x i16>* %p, align 2
 }
 
 define <4 x i32> @test_l_v4i32(<4 x i32>* %p) #0 {
@@ -44,7 +44,7 @@ entry:
   ret <4 x i32> %r
 
 ; CHECK-LABEL: test_l_v4i32
-; CHECK: cost of 1 for instruction:   %r = load <4 x i32>, <4 x i32>* %p, align 4
+; CHECK: cost of 2 for instruction:   %r = load <4 x i32>, <4 x i32>* %p, align 4
 }
 
 define <8 x i32> @test_l_v8i32(<8 x i32>* %p) #0 {
@@ -53,7 +53,7 @@ entry:
   ret <8 x i32> %r
 
 ; CHECK-LABEL: test_l_v8i32
-; CHECK: cost of 2 for instruction:   %r = load <8 x i32>, <8 x i32>* %p, align 4
+; CHECK: cost of 4 for instruction:   %r = load <8 x i32>, <8 x i32>* %p, align 4
 }
 
 define <2 x i64> @test_l_v2i64(<2 x i64>* %p) #0 {
@@ -80,7 +80,7 @@ entry:
   ret <4 x float> %r
 
 ; CHECK-LABEL: test_l_v4float
-; CHECK: cost of 1 for instruction:   %r = load <4 x float>, <4 x float>* %p, align 4
+; CHECK: cost of 2 for instruction:   %r = load <4 x float>, <4 x float>* %p, align 4
 }
 
 define <8 x float> @test_l_v8float(<8 x float>* %p) #0 {
@@ -89,7 +89,7 @@ entry:
   ret <8 x float> %r
 
 ; CHECK-LABEL: test_l_v8float
-; CHECK: cost of 2 for instruction:   %r = load <8 x float>, <8 x float>* %p, align 4
+; CHECK: cost of 4 for instruction:   %r = load <8 x float>, <8 x float>* %p, align 4
 }
 
 define <2 x double> @test_l_v2double(<2 x double>* %p) #0 {
@@ -224,7 +224,7 @@ entry:
   ret <4 x float> %r
 
 ; CHECK-LABEL: test_l_qv4float
-; CHECK: cost of 1 for instruction:   %r = load <4 x float>, <4 x float>* %p, align 4
+; CHECK: cost of 2 for instruction:   %r = load <4 x float>, <4 x float>* %p, align 4
 }
 
 define <8 x float> @test_l_qv8float(<8 x float>* %p) #1 {
@@ -233,7 +233,7 @@ entry:
   ret <8 x float> %r
 
 ; CHECK-LABEL: test_l_qv8float
-; CHECK: cost of 2 for instruction:   %r = load <8 x float>, <8 x float>* %p, align 4
+; CHECK: cost of 4 for instruction:   %r = load <8 x float>, <8 x float>* %p, align 4
 }
 
 define <4 x double> @test_l_qv4double(<4 x double>* %p) #1 {
@@ -242,7 +242,7 @@ entry:
   ret <4 x double> %r
 
 ; CHECK-LABEL: test_l_qv4double
-; CHECK: cost of 1 for instruction:   %r = load <4 x double>, <4 x double>* %p, align 8
+; CHECK: cost of 2 for instruction:   %r = load <4 x double>, <4 x double>* %p, align 8
 }
 
 define <8 x double> @test_l_qv8double(<8 x double>* %p) #1 {
@@ -251,7 +251,7 @@ entry:
   ret <8 x double> %r
 
 ; CHECK-LABEL: test_l_qv8double
-; CHECK: cost of 2 for instruction:   %r = load <8 x double>, <8 x double>* %p, align 8
+; CHECK: cost of 4 for instruction:   %r = load <8 x double>, <8 x double>* %p, align 8
 }
 
 define void @test_s_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 {
-- 
2.34.1