MaxStoresPerMemcpyOptSize = 8;
MaxStoresPerMemmove = 32;
MaxStoresPerMemmoveOptSize = 8;
+ } else if (Subtarget.getDarwinDirective() == PPC::DIR_A2) {
+ // The A2 also benefits from (very) aggressive inlining of memcpy and
+ // friends. The overhead of a the function call, even when warm, can be
+ // over one hundred cycles.
+ MaxStoresPerMemset = 128;
+ MaxStoresPerMemcpy = 128;
+ MaxStoresPerMemmove = 128;
}
}
bool IsMemset, bool ZeroMemset,
bool MemcpyStrSrc,
MachineFunction &MF) const {
+ const Function *F = MF.getFunction();
+ // When expanding a memset, require at least two QPX instructions to cover
+ // the cost of loading the value to be stored from the constant pool.
+ if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) &&
+ (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) &&
+ !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+ return MVT::v4f64;
+ }
+
+ // We should use Altivec/VSX loads and stores when available. For unaligned
+ // addresses, unaligned VSX loads are only fast starting with the P8.
+ if (Subtarget.hasAltivec() && Size >= 16 &&
+ (((!SrcAlign || SrcAlign >= 16) && (!DstAlign || DstAlign >= 16)) ||
+ ((IsMemset && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
+ return MVT::v4i32;
+
if (Subtarget.isPPC64()) {
return MVT::i64;
- } else {
- return MVT::i32;
}
+
+ return MVT::i32;
}
/// \brief Returns true if it is beneficial to convert a load of a constant
--- /dev/null
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s -check-prefix=PWR7
+; RUN: llc -mcpu=pwr8 < %s | FileCheck %s -check-prefix=PWR8
+; RUN: llc -mcpu=a2q < %s | FileCheck %s -check-prefix=A2Q
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @foo1(double* nocapture %x, double* nocapture readonly %y) #0 {
+entry:
+ %0 = bitcast double* %x to i8*
+ %1 = bitcast double* %y to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 32, i32 8, i1 false)
+ ret void
+
+; PWR7-LABEL: @foo1
+; PWR7-NOT: bl memcpy
+; PWR7: ld {{[0-9]+}}, {{[0-9]+}}(4)
+; PWR7: std {{[0-9]+}}, {{[0-9]+}}(3)
+; PWR7: blr
+
+; PWR8-LABEL: @foo1
+; PWR8: lxvw4x
+; PWR8: stxvw4x
+; PWR8: blr
+
+; A2Q-LABEL: @foo1
+; A2Q-NOT: bl memcpy
+; A2Q: ld {{[0-9]+}}, {{[0-9]+}}(4)
+; A2Q: std {{[0-9]+}}, {{[0-9]+}}(3)
+; A2Q: blr
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0
+
+; Function Attrs: nounwind
+define void @foo2(double* nocapture %x, double* nocapture readonly %y) #0 {
+entry:
+ %0 = bitcast double* %x to i8*
+ %1 = bitcast double* %y to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 128, i32 8, i1 false)
+ ret void
+
+; PWR7-LABEL: @foo2
+; PWR7: bl memcpy
+; PWR7: blr
+
+; PWR8-LABEL: @foo2
+; PWR8: lxvw4x
+; PWR8: stxvw4x
+; PWR8: blr
+
+; A2Q-LABEL: @foo2
+; A2Q-NOT: bl memcpy
+; A2Q: ld {{[0-9]+}}, {{[0-9]+}}(4)
+; A2Q: std {{[0-9]+}}, {{[0-9]+}}(3)
+; A2Q: blr
+}
+
+; Function Attrs: nounwind
+define void @bar1(double* nocapture %x) #0 {
+entry:
+ %0 = bitcast double* %x to i8*
+ tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 128, i32 8, i1 false)
+ ret void
+
+; PWR7-LABEL: @bar1
+; PWR7-NOT: bl memset
+; PWR7: stxvw4x
+; PWR7: blr
+
+; PWR8-LABEL: @bar1
+; PWR8-NOT: bl memset
+; PWR8: stxvw4x
+; PWR8: blr
+
+; A2Q-LABEL: @bar1
+; A2Q-NOT: bl memset
+; A2Q: std {{[0-9]+}}, {{[0-9]+}}(3)
+; A2Q: blr
+}
+
+; Function Attrs: nounwind
+define void @bar2(double* nocapture %x) #0 {
+entry:
+ %0 = bitcast double* %x to i8*
+ tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 128, i32 32, i1 false)
+ ret void
+
+; PWR7-LABEL: @bar2
+; PWR7-NOT: bl memset
+; PWR7: stxvw4x
+; PWR7: blr
+
+; PWR8-LABEL: @bar2
+; PWR8-NOT: bl memset
+; PWR8: stxvw4x
+; PWR8: blr
+
+; A2Q-LABEL: @bar2
+; A2Q-NOT: bl memset
+; A2Q: qvstfdx
+; A2Q: blr
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #0
+
+attributes #0 = { nounwind }
+
}
; CHECK-LABEL: @caller2
; CHECK: std 3, [[OFF:[0-9]+]](1)
-; CHECK: ld [[REG:[0-9]+]], [[OFF]](1)
-; CHECK: std [[REG]], 128(1)
+; CHECK: addi [[REG1:[0-9]+]], 1, [[OFF]]
+; CHECK: lxvw4x [[REG2:[0-9]+]], 0, [[REG1]]
+; CHECK: li [[REG3:[0-9]+]], 128
+; CHECK: stxvw4x 0, 1, [[REG3]]
; CHECK: bl test2