From 915e5e56d7cc8e140d33202eed6244ed0356ed1f Mon Sep 17 00:00:00 2001 From: Chris Lattner Date: Thu, 12 Feb 2004 17:53:22 +0000 Subject: [PATCH] Add support for the rep movs[bwd] instructions, and emit them when code generating the llvm.memcpy intrinsic. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@11351 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/InstSelectSimple.cpp | 53 +++++++++++++++++++++++++++++ lib/Target/X86/X86CodeEmitter.cpp | 4 +++ lib/Target/X86/X86ISelSimple.cpp | 53 +++++++++++++++++++++++++++++ lib/Target/X86/X86InstrInfo.h | 18 ++++++---- lib/Target/X86/X86InstrInfo.td | 25 +++++++++----- 5 files changed, 138 insertions(+), 15 deletions(-) diff --git a/lib/Target/X86/InstSelectSimple.cpp b/lib/Target/X86/InstSelectSimple.cpp index 9df7697344d..67849e2edbb 100644 --- a/lib/Target/X86/InstSelectSimple.cpp +++ b/lib/Target/X86/InstSelectSimple.cpp @@ -1157,6 +1157,7 @@ void ISel::LowerUnknownIntrinsicFunctionCalls(Function &F) { case Intrinsic::va_start: case Intrinsic::va_copy: case Intrinsic::va_end: + case Intrinsic::memcpy: // We directly implement these intrinsics break; default: @@ -1188,6 +1189,58 @@ void ISel::visitIntrinsicCall(Intrinsic::ID ID, CallInst &CI) { return; case Intrinsic::va_end: return; // Noop on X86 + case Intrinsic::memcpy: { + assert(CI.getNumOperands() == 5 && "Illegal llvm.memcpy call!"); + unsigned Align = 1; + if (ConstantInt *AlignC = dyn_cast<ConstantInt>(CI.getOperand(4))) { + Align = AlignC->getRawValue(); + if (Align == 0) Align = 1; + } + + // Turn the byte count into # iterations + unsigned ByteReg = getReg(CI.getOperand(3)); + unsigned CountReg; + + switch (Align & 3) { + case 2: // WORD aligned + CountReg = makeAnotherReg(Type::IntTy); + BuildMI(BB, X86::SHRir32, 2, CountReg).addReg(ByteReg).addZImm(1); + break; + case 0: // DWORD aligned + CountReg = makeAnotherReg(Type::IntTy); + BuildMI(BB, X86::SHRir32, 2, CountReg).addReg(ByteReg).addZImm(2); + break; + case 1: // BYTE aligned + case 3: // BYTE 
aligned + CountReg = ByteReg; + break; + } + + // No matter what the alignment is, we put the source in ESI, the + // destination in EDI, and the count in ECX. + TmpReg1 = getReg(CI.getOperand(1)); + TmpReg2 = getReg(CI.getOperand(2)); + BuildMI(BB, X86::MOVrr32, 1, X86::ECX).addReg(CountReg); + BuildMI(BB, X86::MOVrr32, 1, X86::EDI).addReg(TmpReg1); + BuildMI(BB, X86::MOVrr32, 1, X86::ESI).addReg(TmpReg2); + + unsigned Bytes = getReg(CI.getOperand(3)); + switch (Align & 3) { + case 1: // BYTE aligned + case 3: // BYTE aligned + BuildMI(BB, X86::REP_MOVSB, 0); + break; + case 2: // WORD aligned + BuildMI(BB, X86::REP_MOVSW, 0); + break; + case 0: // DWORD aligned + BuildMI(BB, X86::REP_MOVSD, 0); + break; + } + + return; + } + default: assert(0 && "Error: unknown intrinsics should have been lowered!"); } } diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index be6319cd0c5..83e5e102d59 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -470,6 +470,9 @@ void Emitter::emitInstruction(MachineInstr &MI) { unsigned Opcode = MI.getOpcode(); const TargetInstrDescriptor &Desc = II->get(Opcode); + // Emit the repeat opcode prefix as needed. + if ((Desc.TSFlags & X86II::Op0Mask) == X86II::REP) MCE.emitByte(0xF3); + // Emit instruction prefixes if necessary if (Desc.TSFlags & X86II::OpSize) MCE.emitByte(0x66);// Operand size... @@ -477,6 +480,7 @@ void Emitter::emitInstruction(MachineInstr &MI) { case X86II::TB: MCE.emitByte(0x0F); // Two-byte opcode prefix break; + case X86II::REP: break; // already handled. 
case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB: case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF: MCE.emitByte(0xD8+ diff --git a/lib/Target/X86/X86ISelSimple.cpp b/lib/Target/X86/X86ISelSimple.cpp index 9df7697344d..67849e2edbb 100644 --- a/lib/Target/X86/X86ISelSimple.cpp +++ b/lib/Target/X86/X86ISelSimple.cpp @@ -1157,6 +1157,7 @@ void ISel::LowerUnknownIntrinsicFunctionCalls(Function &F) { case Intrinsic::va_start: case Intrinsic::va_copy: case Intrinsic::va_end: + case Intrinsic::memcpy: // We directly implement these intrinsics break; default: @@ -1188,6 +1189,58 @@ void ISel::visitIntrinsicCall(Intrinsic::ID ID, CallInst &CI) { return; case Intrinsic::va_end: return; // Noop on X86 + case Intrinsic::memcpy: { + assert(CI.getNumOperands() == 5 && "Illegal llvm.memcpy call!"); + unsigned Align = 1; + if (ConstantInt *AlignC = dyn_cast<ConstantInt>(CI.getOperand(4))) { + Align = AlignC->getRawValue(); + if (Align == 0) Align = 1; + } + + // Turn the byte count into # iterations + unsigned ByteReg = getReg(CI.getOperand(3)); + unsigned CountReg; + + switch (Align & 3) { + case 2: // WORD aligned + CountReg = makeAnotherReg(Type::IntTy); + BuildMI(BB, X86::SHRir32, 2, CountReg).addReg(ByteReg).addZImm(1); + break; + case 0: // DWORD aligned + CountReg = makeAnotherReg(Type::IntTy); + BuildMI(BB, X86::SHRir32, 2, CountReg).addReg(ByteReg).addZImm(2); + break; + case 1: // BYTE aligned + case 3: // BYTE aligned + CountReg = ByteReg; + break; + } + + // No matter what the alignment is, we put the source in ESI, the + // destination in EDI, and the count in ECX. 
+ TmpReg1 = getReg(CI.getOperand(1)); + TmpReg2 = getReg(CI.getOperand(2)); + BuildMI(BB, X86::MOVrr32, 1, X86::ECX).addReg(CountReg); + BuildMI(BB, X86::MOVrr32, 1, X86::EDI).addReg(TmpReg1); + BuildMI(BB, X86::MOVrr32, 1, X86::ESI).addReg(TmpReg2); + + unsigned Bytes = getReg(CI.getOperand(3)); + switch (Align & 3) { + case 1: // BYTE aligned + case 3: // BYTE aligned + BuildMI(BB, X86::REP_MOVSB, 0); + break; + case 2: // WORD aligned + BuildMI(BB, X86::REP_MOVSW, 0); + break; + case 0: // DWORD aligned + BuildMI(BB, X86::REP_MOVSD, 0); + break; + } + + return; + } + default: assert(0 && "Error: unknown intrinsics should have been lowered!"); } } diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 98f9fe68d72..c6e3b761766 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -86,9 +86,9 @@ namespace X86II { OpSize = 1 << 5, // Op0Mask - There are several prefix bytes that are used to form two byte - // opcodes. These are currently 0x0F, and 0xD8-0xDF. This mask is used to - // obtain the setting of this field. If no bits in this field is set, there - // is no prefix byte for obtaining a multibyte opcode. + // opcodes. These are currently 0x0F, 0xF3, and 0xD8-0xDF. This mask is + // used to obtain the setting of this field. If no bits in this field are + // set, there is no prefix byte for obtaining a multibyte opcode. // Op0Shift = 6, Op0Mask = 0xF << Op0Shift, @@ -97,12 +97,16 @@ namespace X86II { // starts with a 0x0F byte before the real opcode. TB = 1 << Op0Shift, + // REP - The 0xF3 prefix byte indicating repetition of the following + // instruction. + REP = 2 << Op0Shift, + // D8-DF - These escape opcodes are used by the floating point unit. These // values must remain sequential. 
- D8 = 2 << Op0Shift, D9 = 3 << Op0Shift, - DA = 4 << Op0Shift, DB = 5 << Op0Shift, - DC = 6 << Op0Shift, DD = 7 << Op0Shift, - DE = 8 << Op0Shift, DF = 9 << Op0Shift, + D8 = 3 << Op0Shift, D9 = 4 << Op0Shift, + DA = 5 << Op0Shift, DB = 6 << Op0Shift, + DC = 7 << Op0Shift, DD = 8 << Op0Shift, + DE = 9 << Op0Shift, DF = 10 << Op0Shift, //===------------------------------------------------------------------===// // This three-bit field describes the size of a memory operand. Zero is diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 92f193f9850..4bb1a9550a9 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -92,14 +92,15 @@ class Pattern { // emitter that various prefix bytes are required. class OpSize { bit hasOpSizePrefix = 1; } class TB { bits<4> Prefix = 1; } -class D8 { bits<4> Prefix = 2; } -class D9 { bits<4> Prefix = 3; } -class DA { bits<4> Prefix = 4; } -class DB { bits<4> Prefix = 5; } -class DC { bits<4> Prefix = 6; } -class DD { bits<4> Prefix = 7; } -class DE { bits<4> Prefix = 8; } -class DF { bits<4> Prefix = 9; } +class REP { bits<4> Prefix = 2; } +class D8 { bits<4> Prefix = 3; } +class D9 { bits<4> Prefix = 4; } +class DA { bits<4> Prefix = 5; } +class DB { bits<4> Prefix = 6; } +class DC { bits<4> Prefix = 7; } +class DD { bits<4> Prefix = 8; } +class DE { bits<4> Prefix = 9; } +class DF { bits<4> Prefix = 10; } @@ -172,6 +173,14 @@ def XCHGrr32 : X86Inst<"xchg", 0x87, MRMDestReg, Arg32>; // xchg R32, R32 def LEAr16 : X86Inst<"lea", 0x8D, MRMSrcMem, Arg16>, OpSize; // R16 = lea [mem] def LEAr32 : X86Inst<"lea", 0x8D, MRMSrcMem, Arg32>; // R32 = lea [mem] + +def REP_MOVSB : X86Inst<"rep movsb", 0xA4, RawFrm, NoArg>, REP, + Imp<[ECX,EDI,ESI], [ECX,EDI,ESI]>; +def REP_MOVSW : X86Inst<"rep movsw", 0xA5, RawFrm, NoArg>, REP, OpSize, + Imp<[ECX,EDI,ESI], [ECX,EDI,ESI]>; +def REP_MOVSD : X86Inst<"rep movsd", 0xA5, RawFrm, NoArg>, REP, + Imp<[ECX,EDI,ESI], [ECX,EDI,ESI]>; + 
//===----------------------------------------------------------------------===// // Move Instructions... // -- 2.34.1