From 11add26ec2bdf5109f0ff2ee19d237664687b914 Mon Sep 17 00:00:00 2001
From: Chad Rosier <mcrosier@apple.com>
Date: Fri, 11 Nov 2011 23:31:03 +0000
Subject: [PATCH] Add support in fast-isel for selecting memset/memcpy/memmove
 intrinsics.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@144426 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMFastISel.cpp          | 70 ++++++++++++++++++----
 test/CodeGen/ARM/fast-isel-intrinsic.ll | 78 +++++++++++++++++++++++++
 2 files changed, 138 insertions(+), 10 deletions(-)
 create mode 100644 test/CodeGen/ARM/fast-isel-intrinsic.ll
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index da2ca3ecd93..4bf55fb8f38 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -164,7 +164,8 @@ class ARMFastISel : public FastISel {
     bool SelectFPToSI(const Instruction *I);
     bool SelectSDiv(const Instruction *I);
     bool SelectSRem(const Instruction *I);
-    bool SelectCall(const Instruction *I);
+    bool SelectCall(const Instruction *I, const char *IntrMemName);
+    bool SelectIntrinsicCall(const IntrinsicInst &I);
     bool SelectSelect(const Instruction *I);
     bool SelectRet(const Instruction *I);
     bool SelectTrunc(const Instruction *I);
@@ -1997,12 +1998,13 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
   return true;
 }
 
-bool ARMFastISel::SelectCall(const Instruction *I) {
+bool ARMFastISel::SelectCall(const Instruction *I,
+                             const char *IntrMemName = 0) {
   const CallInst *CI = cast<CallInst>(I);
   const Value *Callee = CI->getCalledValue();
 
-  // Can't handle inline asm or worry about intrinsics yet.
-  if (isa<InlineAsm>(Callee) || isa<IntrinsicInst>(CI)) return false;
+  // Can't handle inline asm.
+  if (isa<InlineAsm>(Callee)) return false;
 
   // Only handle global variable Callees.
   const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
@@ -2044,8 +2046,12 @@ bool ARMFastISel::SelectCall(const Instruction *I) {
   ArgFlags.reserve(CS.arg_size());
   for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
        i != e; ++i) {
-    unsigned Arg = getRegForValue(*i);
+    // If we're lowering a memory intrinsic instead of a regular call, skip the
+    // last two arguments, which shouldn't be passed to the underlying function.
+    if (IntrMemName && e-i <= 2)
+      break;
 
+    unsigned Arg = getRegForValue(*i);
     if (Arg == 0)
       return false;
     ISD::ArgFlagsTy Flags;
@@ -2090,14 +2096,16 @@ bool ARMFastISel::SelectCall(const Instruction *I) {
   if(isThumb2)
     // Explicitly adding the predicate here.
     MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                         TII.get(CallOpc)))
-          .addGlobalAddress(GV, 0, 0);
+                                 TII.get(CallOpc)));
   else
     // Explicitly adding the predicate here.
     MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                         TII.get(CallOpc))
-          .addGlobalAddress(GV, 0, 0));
-
+                                 TII.get(CallOpc)));
+  if (!IntrMemName)
+    MIB.addGlobalAddress(GV, 0, 0);
+  else 
+    MIB.addExternalSymbol(IntrMemName, 0);
+  
   // Add implicit physical register uses to the call.
   for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
     MIB.addReg(RegArgs[i]);
@@ -2112,6 +2120,46 @@ bool ARMFastISel::SelectCall(const Instruction *I) {
   return true;
 }
 
+bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
+  // FIXME: Handle more intrinsics.
+  switch (I.getIntrinsicID()) {
+  default: return false;
+  case Intrinsic::memcpy:
+  case Intrinsic::memmove: {
+    // FIXME: Small memcpy/memmove's are common enough that we want to do them
+    // without a call if possible.
+    const MemTransferInst &MTI = cast<MemTransferInst>(I);
+    // Don't handle volatile.
+    if (MTI.isVolatile())
+      return false;
+    
+    if (!MTI.getLength()->getType()->isIntegerTy(32))
+      return false;
+    
+    if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255)
+      return false;
+
+    const char *IntrMemName = isa<MemCpyInst>(I) ? "memcpy" : "memmove";
+    return SelectCall(&I, IntrMemName);
+  }
+  case Intrinsic::memset: {
+    const MemSetInst &MSI = cast<MemSetInst>(I);
+    // Don't handle volatile.
+    if (MSI.isVolatile())
+      return false;
+    
+    if (!MSI.getLength()->getType()->isIntegerTy(32))
+      return false;
+    
+    if (MSI.getDestAddressSpace() > 255)
+      return false;
+    
+    return SelectCall(&I, "memset");
+  }
+  }
+  return false;    
+}
+
 bool ARMFastISel::SelectTrunc(const Instruction *I) {
   // The high bits for a type smaller than the register size are assumed to be 
   // undefined.
@@ -2235,6 +2283,8 @@ bool ARMFastISel::TargetSelectInstruction(const Instruction *I) {
     case Instruction::SRem:
       return SelectSRem(I);
     case Instruction::Call:
+      if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+        return SelectIntrinsicCall(*II);
       return SelectCall(I);
     case Instruction::Select:
       return SelectSelect(I);
diff --git a/test/CodeGen/ARM/fast-isel-intrinsic.ll b/test/CodeGen/ARM/fast-isel-intrinsic.ll
new file mode 100644
index 00000000000..1954eccc5f9
--- /dev/null
+++ b/test/CodeGen/ARM/fast-isel-intrinsic.ll
@@ -0,0 +1,78 @@
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-darwin | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-darwin | FileCheck %s --check-prefix=THUMB
+
+@message1 = global [60 x i8] c"The LLVM Compiler Infrastructure\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", align 1
+@temp = common global [60 x i8] zeroinitializer, align 1
+
+define void @t1() nounwind ssp {
+; ARM: t1
+; ARM: ldr r0, LCPI0_0
+; ARM: add r0, r0, #5
+; ARM: movw r1, #64
+; ARM: movw r2, #10
+; ARM: uxtb r1, r1
+; ARM: bl #14
+; THUMB: t1
+; THUMB: ldr.n r0, LCPI0_0
+; THUMB: adds r0, #5
+; THUMB: movs r1, #64
+; THUMB: movt r1, #0
+; THUMB: movs r2, #10
+; THUMB: movt r2, #0
+; THUMB: uxtb r1, r1
+; THUMB: bl _memset
+  call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @message1, i32 0, i32 5), i8 64, i32 10, i32 1, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
+
+define void @t2() nounwind ssp {
+; ARM: t2
+; ARM: ldr r0, LCPI1_0
+; ARM: ldr r0, [r0]
+; ARM: add r1, r0, #4
+; ARM: add r0, r0, #16
+; ARM: movw r2, #10
+; ARM: str r0, [sp]                @ 4-byte Spill
+; ARM: mov r0, r1
+; ARM: ldr r1, [sp]                @ 4-byte Reload
+; ARM: bl #14
+; THUMB: t2
+; THUMB: ldr.n r0, LCPI1_0
+; THUMB: ldr r0, [r0]
+; THUMB: adds r1, r0, #4
+; THUMB: adds r0, #16
+; THUMB: movs r2, #10
+; THUMB: movt r2, #0
+; THUMB: mov r0, r1
+; THUMB: bl _memcpy
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 4), i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 16), i32 10, i32 1, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
+define void @t3() nounwind ssp {
+; ARM: t3
+; ARM: ldr r0, LCPI2_0
+; ARM: ldr r0, [r0]
+; ARM: add r1, r0, #4
+; ARM: add r0, r0, #16
+; ARM: movw r2, #10
+; ARM: mov r0, r1
+; ARM: bl #14
+; THUMB: t3
+; THUMB: ldr.n r0, LCPI2_0
+; THUMB: ldr r0, [r0]
+; THUMB: adds r1, r0, #4
+; THUMB: adds r0, #16
+; THUMB: movs r2, #10
+; THUMB: movt r2, #0
+; THUMB: mov r0, r1
+; THUMB: bl _memmove
+  call void @llvm.memmove.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 4), i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 16), i32 10, i32 1, i1 false)
+  ret void
+}
+
+declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
-- 
2.34.1