bool IsQPXType = ST->hasQPX() &&
(LT.second == MVT::v4f64 || LT.second == MVT::v4f32);
+ // If we can use the permutation-based load sequence, then this is also
+ // relatively cheap (not counting loop-invariant instructions): one load plus
+ // one permute (the last load in a series has extra cost, but we're
+ // neglecting that here). Note that on the P7, we should do unaligned loads
+ // for Altivec types using the VSX instructions, but that's more expensive
+ // than using the permutation-based load sequence. On the P8, that's no
+ // longer true.
+ if (Opcode == Instruction::Load &&
+ ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
+ Alignment >= LT.second.getScalarType().getStoreSize())
+ return Cost + LT.first; // Add the cost of the permutations.
+
// For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
// P7, unaligned vector loads are more expensive than the permutation-based
// load sequence, so that might be used instead, but regardless, the net cost
// is about the same (not counting loop-invariant instructions).
if (IsVSXType || (ST->hasVSX() && IsAltivecType))
return Cost;
- // If we can use the permutation-based load sequence, then this is also
- // relatively cheap (not counting loop-invariant instructions).
- bool PermutationLoad = Opcode == Instruction::Load &&
- (IsAltivecType || IsQPXType) &&
- Alignment >= LT.second.getScalarType().getStoreSize();
- if (PermutationLoad)
- return Cost;
-
// PPC in general does not support unaligned loads and stores. They'll need
// to be decomposed based on the alignment factor.
ret <16 x i8> %r
; CHECK-LABEL: test_l_v16i8
-; CHECK: cost of 1 for instruction: %r = load <16 x i8>, <16 x i8>* %p, align 1
+; CHECK: cost of 2 for instruction: %r = load <16 x i8>, <16 x i8>* %p, align 1
}
define <32 x i8> @test_l_v32i8(<32 x i8>* %p) #0 {
ret <32 x i8> %r
; CHECK-LABEL: test_l_v32i8
-; CHECK: cost of 2 for instruction: %r = load <32 x i8>, <32 x i8>* %p, align 1
+; CHECK: cost of 4 for instruction: %r = load <32 x i8>, <32 x i8>* %p, align 1
}
define <8 x i16> @test_l_v8i16(<8 x i16>* %p) #0 {
ret <8 x i16> %r
; CHECK-LABEL: test_l_v8i16
-; CHECK: cost of 1 for instruction: %r = load <8 x i16>, <8 x i16>* %p, align 2
+; CHECK: cost of 2 for instruction: %r = load <8 x i16>, <8 x i16>* %p, align 2
}
define <16 x i16> @test_l_v16i16(<16 x i16>* %p) #0 {
ret <16 x i16> %r
; CHECK-LABEL: test_l_v16i16
-; CHECK: cost of 2 for instruction: %r = load <16 x i16>, <16 x i16>* %p, align 2
+; CHECK: cost of 4 for instruction: %r = load <16 x i16>, <16 x i16>* %p, align 2
}
define <4 x i32> @test_l_v4i32(<4 x i32>* %p) #0 {
ret <4 x i32> %r
; CHECK-LABEL: test_l_v4i32
-; CHECK: cost of 1 for instruction: %r = load <4 x i32>, <4 x i32>* %p, align 4
+; CHECK: cost of 2 for instruction: %r = load <4 x i32>, <4 x i32>* %p, align 4
}
define <8 x i32> @test_l_v8i32(<8 x i32>* %p) #0 {
ret <8 x i32> %r
; CHECK-LABEL: test_l_v8i32
-; CHECK: cost of 2 for instruction: %r = load <8 x i32>, <8 x i32>* %p, align 4
+; CHECK: cost of 4 for instruction: %r = load <8 x i32>, <8 x i32>* %p, align 4
}
define <2 x i64> @test_l_v2i64(<2 x i64>* %p) #0 {
ret <4 x float> %r
; CHECK-LABEL: test_l_v4float
-; CHECK: cost of 1 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4
+; CHECK: cost of 2 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4
}
define <8 x float> @test_l_v8float(<8 x float>* %p) #0 {
ret <8 x float> %r
; CHECK-LABEL: test_l_v8float
-; CHECK: cost of 2 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4
+; CHECK: cost of 4 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4
}
define <2 x double> @test_l_v2double(<2 x double>* %p) #0 {
ret <4 x float> %r
; CHECK-LABEL: test_l_qv4float
-; CHECK: cost of 1 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4
+; CHECK: cost of 2 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4
}
define <8 x float> @test_l_qv8float(<8 x float>* %p) #1 {
ret <8 x float> %r
; CHECK-LABEL: test_l_qv8float
-; CHECK: cost of 2 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4
+; CHECK: cost of 4 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4
}
define <4 x double> @test_l_qv4double(<4 x double>* %p) #1 {
ret <4 x double> %r
; CHECK-LABEL: test_l_qv4double
-; CHECK: cost of 1 for instruction: %r = load <4 x double>, <4 x double>* %p, align 8
+; CHECK: cost of 2 for instruction: %r = load <4 x double>, <4 x double>* %p, align 8
}
define <8 x double> @test_l_qv8double(<8 x double>* %p) #1 {
ret <8 x double> %r
; CHECK-LABEL: test_l_qv8double
-; CHECK: cost of 2 for instruction: %r = load <8 x double>, <8 x double>* %p, align 8
+; CHECK: cost of 4 for instruction: %r = load <8 x double>, <8 x double>* %p, align 8
}
define void @test_s_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 {