From 1887c1c2f99903d13406e723f2dcbab4511e3f49 Mon Sep 17 00:00:00 2001 From: Evan Cheng Date: Thu, 21 Aug 2008 21:00:15 +0000 Subject: [PATCH] Fix a number of byval / memcpy / memset related codegen issues. 1. x86-64 byval alignment should be max of 8 and alignment of type. Previously the code was not doing what the commit message was saying. 2. Do not use byte repeat move and store operations. These are slow. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@55139 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 90 +++++++++++++------------- test/CodeGen/X86/2004-02-12-Memcpy.ll | 4 +- test/CodeGen/X86/byval3.ll | 4 +- test/CodeGen/X86/byval4.ll | 4 +- test/CodeGen/X86/byval5.ll | 2 +- test/CodeGen/X86/memset-2.ll | 45 +++++++++++++ test/CodeGen/X86/memset64-on-x86-32.ll | 12 ++-- 7 files changed, 103 insertions(+), 58 deletions(-) create mode 100644 test/CodeGen/X86/memset-2.ll diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index fdc461894d6..a6bffe285f7 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -796,8 +796,14 @@ static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) { /// that contain SSE vectors are placed at 16-byte boundaries while the rest /// are at 4-byte boundaries. unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const { - if (Subtarget->is64Bit()) - return getTargetData()->getABITypeAlignment(Ty); + if (Subtarget->is64Bit()) { + // Max of 8 and alignment of type. + unsigned TyAlign = getTargetData()->getABITypeAlignment(Ty); + if (TyAlign > 8) + return TyAlign; + return 8; + } + unsigned Align = 4; if (Subtarget->hasSSE1()) getMaxByValAlign(Ty, Align); @@ -5014,16 +5020,16 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, - SDValue Chain, - SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, const Value *DstSV, uint64_t DstSVOff) { ConstantSDNode *ConstantSize = dyn_cast(Size); /// If not DWORD aligned or size is more than the threshold, call the library. /// The libc version is likely to be faster for these cases. It can use the /// address value and run time information about the CPU. - if ((Align & 3) == 0 || + if ((Align & 3) != 0 || !ConstantSize || ConstantSize->getValue() > getSubtarget()->getMaxInlineSizeThreshold()) { SDValue InFlag(0, 0); @@ -5065,27 +5071,27 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, // If the value is a constant, then we can potentially use larger sets. switch (Align & 3) { - case 2: // WORD aligned - AVT = MVT::i16; - ValReg = X86::AX; - Val = (Val << 8) | Val; - break; - case 0: // DWORD aligned - AVT = MVT::i32; - ValReg = X86::EAX; - Val = (Val << 8) | Val; - Val = (Val << 16) | Val; - if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned - AVT = MVT::i64; - ValReg = X86::RAX; - Val = (Val << 32) | Val; - } - break; - default: // Byte aligned - AVT = MVT::i8; - ValReg = X86::AL; - Count = DAG.getIntPtrConstant(SizeVal); - break; + case 2: // WORD aligned + AVT = MVT::i16; + ValReg = X86::AX; + Val = (Val << 8) | Val; + break; + case 0: // DWORD aligned + AVT = MVT::i32; + ValReg = X86::EAX; + Val = (Val << 8) | Val; + Val = (Val << 16) | Val; + if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned + AVT = MVT::i64; + ValReg = X86::RAX; + Val = (Val << 32) | Val; + } + break; + default: // Byte aligned + AVT = MVT::i8; + ValReg = X86::AL; + Count = DAG.getIntPtrConstant(SizeVal); + break; } if (AVT.bitsGT(MVT::i8)) { @@ -5153,13 +5159,11 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, SDValue X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, - SDValue Chain, - SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, - bool AlwaysInline, - const Value *DstSV, uint64_t DstSVOff, - const Value *SrcSV, uint64_t SrcSVOff){ - + SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool AlwaysInline, + const Value *DstSV, uint64_t DstSVOff, + const Value *SrcSV, uint64_t SrcSVOff) { // This requires the copy size to be a constant, preferrably // within a subtarget-specific limit. ConstantSDNode *ConstantSize = dyn_cast(Size); @@ -5169,21 +5173,19 @@ X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold()) return SDValue(); - MVT AVT; - unsigned BytesLeft = 0; - if (Align >= 8 && Subtarget->is64Bit()) + /// If not DWORD aligned, call the library. + if ((Align & 3) != 0) + return SDValue(); + + // DWORD aligned + MVT AVT = MVT::i32; + if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) // QWORD aligned AVT = MVT::i64; - else if (Align >= 4) - AVT = MVT::i32; - else if (Align >= 2) - AVT = MVT::i16; - else - AVT = MVT::i8; unsigned UBytes = AVT.getSizeInBits() / 8; unsigned CountVal = SizeVal / UBytes; SDValue Count = DAG.getIntPtrConstant(CountVal); - BytesLeft = SizeVal % UBytes; + unsigned BytesLeft = SizeVal % UBytes; SDValue InFlag(0, 0); Chain = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RCX : X86::ECX, diff --git a/test/CodeGen/X86/2004-02-12-Memcpy.ll b/test/CodeGen/X86/2004-02-12-Memcpy.ll index 59364c1f6d6..56bb21caf3c 100644 --- a/test/CodeGen/X86/2004-02-12-Memcpy.ll +++ b/test/CodeGen/X86/2004-02-12-Memcpy.ll @@ -1,11 +1,11 @@ -; RUN: llvm-as < %s | llc -march=x86 -mtriple=i686-pc-linux-gnu | grep movs | count 3 +; RUN: llvm-as < %s | llc -march=x86 -mtriple=i686-pc-linux-gnu | grep movs | count 1 @A = global [32 x i32] zeroinitializer @B = global [32 x i32] zeroinitializer declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) -define void @main() { +define void @main() nounwind { ; dword copy call void @llvm.memcpy.i32(i8* bitcast ([32 x i32]* @A to i8*), i8* bitcast ([32 x i32]* @B to i8*), diff --git a/test/CodeGen/X86/byval3.ll b/test/CodeGen/X86/byval3.ll index 074bab4c0a9..707a4c5d278 100644 --- a/test/CodeGen/X86/byval3.ll +++ b/test/CodeGen/X86/byval3.ll @@ -1,4 +1,4 @@ -; RUN: llvm-as < %s | llc -march=x86-64 | grep rep.movsl | count 2 +; RUN: llvm-as < %s | llc -march=x86-64 | grep rep.movsq | count 2 ; RUN: llvm-as < %s | llc -march=x86 | grep rep.movsl | count 2 %struct.s = type { i32, i32, i32, i32, i32, i32, i32, i32, @@ -7,7 +7,7 @@ i32, i32, i32, i32, i32, i32, i32, i32, i32 } -define void @g(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6) { +define void @g(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6) nounwind { entry: %d = alloca %struct.s, align 16 %tmp = getelementptr %struct.s* %d, i32 0, i32 0 diff --git a/test/CodeGen/X86/byval4.ll b/test/CodeGen/X86/byval4.ll index d2fa9e289e7..5576c361ae1 100644 --- a/test/CodeGen/X86/byval4.ll +++ b/test/CodeGen/X86/byval4.ll @@ -1,4 +1,4 @@ -; RUN: llvm-as < %s | llc -march=x86-64 | grep rep.movsw | count 2 +; RUN: llvm-as < %s | llc -march=x86-64 | grep rep.movsq | count 2 ; RUN: llvm-as < %s | llc -march=x86 | grep rep.movsl | count 2 %struct.s = type { i16, i16, i16, i16, i16, i16, i16, i16, @@ -13,7 +13,7 @@ define void @g(i16 signext %a1, i16 signext %a2, i16 signext %a3, - i16 signext %a4, i16 signext %a5, i16 signext %a6) { + i16 signext %a4, i16 signext %a5, i16 signext %a6) nounwind { entry: %a = alloca %struct.s, align 16 %tmp = getelementptr %struct.s* %a, i32 0, i32 0 diff --git a/test/CodeGen/X86/byval5.ll b/test/CodeGen/X86/byval5.ll index fd9c197bbfd..c6f4588dd45 100644 --- a/test/CodeGen/X86/byval5.ll +++ b/test/CodeGen/X86/byval5.ll @@ -1,4 +1,4 @@ -; RUN: llvm-as < %s | llc -march=x86-64 | grep rep.movsb | count 2 +; RUN: llvm-as < %s | llc -march=x86-64 | grep rep.movsq | count 2 ; RUN: llvm-as < %s | llc -march=x86 | grep rep.movsl | count 2 %struct.s = type { i8, i8, i8, i8, i8, i8, i8, i8, diff --git a/test/CodeGen/X86/memset-2.ll b/test/CodeGen/X86/memset-2.ll new file mode 100644 index 00000000000..2ad665cda75 --- /dev/null +++ b/test/CodeGen/X86/memset-2.ll @@ -0,0 +1,45 @@ +; RUN: llvm-as < %s | llc -march=x86 | not grep rep +; RUN: llvm-as < %s | llc -march=x86 | grep memset + +declare void @llvm.memset.i32(i8*, i8, i32, i32) nounwind + +define fastcc i32 @cli_scanzip(i32 %desc) nounwind { +entry: + br label %bb8.i.i.i.i + +bb8.i.i.i.i: ; preds = %bb8.i.i.i.i, %entry + icmp eq i32 0, 0 ; :0 [#uses=1] + br i1 %0, label %bb61.i.i.i, label %bb8.i.i.i.i + +bb32.i.i.i: ; preds = %bb61.i.i.i + ptrtoint i8* %tail.0.i.i.i to i32 ; :1 [#uses=1] + sub i32 0, %1 ; :2 [#uses=1] + icmp sgt i32 %2, 19 ; :3 [#uses=1] + br i1 %3, label %bb34.i.i.i, label %bb61.i.i.i + +bb34.i.i.i: ; preds = %bb32.i.i.i + load i32* null, align 4 ; :4 [#uses=1] + icmp eq i32 %4, 101010256 ; :5 [#uses=1] + br i1 %5, label %bb8.i11.i.i.i, label %bb61.i.i.i + +bb8.i11.i.i.i: ; preds = %bb8.i11.i.i.i, %bb34.i.i.i + icmp eq i32 0, 0 ; :6 [#uses=1] + br i1 %6, label %cli_dbgmsg.exit49.i, label %bb8.i11.i.i.i + +cli_dbgmsg.exit49.i: ; preds = %bb8.i11.i.i.i + icmp eq [32768 x i8]* null, null ; :7 [#uses=1] + br i1 %7, label %bb1.i28.i, label %bb8.i.i + +bb61.i.i.i: ; preds = %bb61.i.i.i, %bb34.i.i.i, %bb32.i.i.i, %bb8.i.i.i.i + %tail.0.i.i.i = getelementptr [1024 x i8]* null, i32 0, i32 0 ; [#uses=2] + load i8* %tail.0.i.i.i, align 1 ; :8 [#uses=1] + icmp eq i8 %8, 80 ; :9 [#uses=1] + br i1 %9, label %bb32.i.i.i, label %bb61.i.i.i + +bb1.i28.i: ; preds = %cli_dbgmsg.exit49.i + call void @llvm.memset.i32( i8* null, i8 0, i32 88, i32 1 ) nounwind + unreachable + +bb8.i.i: ; preds = %bb8.i.i, %cli_dbgmsg.exit49.i + br label %bb8.i.i +} diff --git a/test/CodeGen/X86/memset64-on-x86-32.ll b/test/CodeGen/X86/memset64-on-x86-32.ll index 7045c0faf0b..d76d4d47924 100644 --- a/test/CodeGen/X86/memset64-on-x86-32.ll +++ b/test/CodeGen/X86/memset64-on-x86-32.ll @@ -1,12 +1,10 @@ -; RUN: llvm-as < %s | llc -march=x86 | grep stosb +; RUN: llvm-as < %s | llc -mtriple=i386-apple-darwin | grep stosl +; RUN: llvm-as < %s | llc -mtriple=x86_64-apple-darwin | grep movq | count 10 -target triple = "i386-apple-darwin9" - %struct.S = type { [80 x i8] } - -define %struct.S* @bork() { +define void @bork() nounwind { entry: - call void @llvm.memset.i64( i8* null, i8 0, i64 80, i32 1 ) - ret %struct.S* null + call void @llvm.memset.i64( i8* null, i8 0, i64 80, i32 4 ) + ret void } declare void @llvm.memset.i64(i8*, i8, i64, i32) nounwind -- 2.34.1