Use vld1/vst1 to load/store f64 if alignment is < 4 and the target allows unaligned access.

author Evan Cheng <evan.cheng@apple.com>

Wed, 15 Aug 2012 17:44:53 +0000 (17:44 +0000)

committer Evan Cheng <evan.cheng@apple.com>

Wed, 15 Aug 2012 17:44:53 +0000 (17:44 +0000)
author Evan Cheng <evan.cheng@apple.com>
Wed, 15 Aug 2012 17:44:53 +0000 (17:44 +0000)
committer Evan Cheng <evan.cheng@apple.com>
Wed, 15 Aug 2012 17:44:53 +0000 (17:44 +0000)
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp

index c66618a8ef5f89fc589a4cd2a0d83eaa589a78b4..77181cfcac39c61d5bd677c37460f3e255a321f6 100644 (file)
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -8808,6 +8808,8 @@ bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
    case MVT::i16:
    case MVT::i32:
      return true;
+  case MVT::f64:
+    return Subtarget->hasNEON();
    // FIXME: VLD1 etc with standard alignment is legal.
    }
  }
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td

index fbd7e7b8e5b6f77c7640ecb9ddcd8e86e5a68ac9..76c897c7958a133c9de1ad077f037590aad9639c 100644 (file)
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -242,6 +242,9 @@ def UseFusedMAC      : Predicate<"(TM.Options.AllowFPOpFusion =="
  def DontUseFusedMAC  : Predicate<"!Subtarget->hasVFP4() || "
                                   "Subtarget->isTargetDarwin()">;
  
+def IsLE             : Predicate<"TLI.isLittleEndian()">;
+def IsBE             : Predicate<"TLI.isBigEndian()">;
+
  //===----------------------------------------------------------------------===//
  // ARM Flag Definitions.
  
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td

index 31340881920d8e756ff31dc0124430f11fd7bbbb..048d340df00633d71a5340671b1edfe0578dd9b9 100644 (file)
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -398,6 +398,27 @@ def VecListFourQWordIndexed : Operand<i32> {
    let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
  }
  
+def hword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() == 2;
+}]>;
+def hword_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                                 (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() == 2;
+}]>;
+def byte_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() == 1;
+}]>;
+def byte_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                             (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() == 1;
+}]>;
+def non_word_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() < 4;
+}]>;
+def non_word_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                                    (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() < 4;
+}]>;
  
  //===----------------------------------------------------------------------===//
  // NEON-specific DAG Nodes.
@@ -2238,6 +2259,19 @@ def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>;
  
  } // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1
  
+// Use vld1/vst1 for unaligned f64 load / store
+def : Pat<(f64 (hword_alignedload addrmode6:$addr)),
+          (VLD1d16 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(hword_alignedstore (f64 DPR:$value), addrmode6:$addr),
+          (VST1d16 addrmode6:$addr, DPR:$value)>, Requires<[IsLE]>;
+def : Pat<(f64 (byte_alignedload addrmode6:$addr)),
+          (VLD1d8 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(byte_alignedstore (f64 DPR:$value), addrmode6:$addr),
+          (VST1d8 addrmode6:$addr, DPR:$value)>, Requires<[IsLE]>;
+def : Pat<(f64 (non_word_alignedload addrmode6:$addr)),
+          (VLD1d64 addrmode6:$addr)>, Requires<[IsBE]>;
+def : Pat<(non_word_alignedstore (f64 DPR:$value), addrmode6:$addr),
+          (VST1d64 addrmode6:$addr, DPR:$value)>, Requires<[IsBE]>;
  
  //===----------------------------------------------------------------------===//
  // NEON pattern fragments
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td

index e3ab30419fac26f1ab861bf2815c19f1b5648865..eb7eaa6c970816f696b6e718cac9e0238aa97402 100644 (file)
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -61,6 +61,15 @@ def vfp_f64imm : Operand<f64>,
    let ParserMatchClass = FPImmOperand;
  }
  
+def alignedload32 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+
+def alignedstore32 : PatFrag<(ops node:$val, node:$ptr),
+                             (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+
  // The VCVT to/from fixed-point instructions encode the 'fbits' operand
  // (the number of fixed bits) differently than it appears in the assembly
  // source. It's encoded as "Size - fbits" where Size is the size of the
@@ -86,7 +95,7 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in {
  
  def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$Dd), (ins addrmode5:$addr),
                   IIC_fpLoad64, "vldr", "\t$Dd, $addr",
-                 [(set DPR:$Dd, (f64 (load addrmode5:$addr)))]>;
+                 [(set DPR:$Dd, (f64 (alignedload32 addrmode5:$addr)))]>;
  
  def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr),
                   IIC_fpLoad32, "vldr", "\t$Sd, $addr",
@@ -100,7 +109,7 @@ def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr),
  
  def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr),
                   IIC_fpStore64, "vstr", "\t$Dd, $addr",
-                 [(store (f64 DPR:$Dd), addrmode5:$addr)]>;
+                 [(alignedstore32 (f64 DPR:$Dd), addrmode5:$addr)]>;
  
  def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr),
                   IIC_fpStore32, "vstr", "\t$Sd, $addr",
diff --git a/test/CodeGen/ARM/unaligned_load_store.ll b/test/CodeGen/ARM/unaligned_load_store.ll

index a8237c60e4e0db90cc0ca45479cb10dd1d31765e..869b92675def6ccc713ae2f22a925856c8163d45 100644 (file)
--- a/test/CodeGen/ARM/unaligned_load_store.ll
+++ b/test/CodeGen/ARM/unaligned_load_store.ll
@@ -1,25 +1,25 @@
-; RUN: llc < %s -march=arm -pre-RA-sched=source | FileCheck %s -check-prefix=GENERIC
-; RUN: llc < %s -mtriple=armv6-apple-darwin | FileCheck %s -check-prefix=DARWIN_V6
-; RUN: llc < %s -mtriple=armv6-apple-darwin -arm-strict-align | FileCheck %s -check-prefix=GENERIC
-; RUN: llc < %s -mtriple=armv6-linux | FileCheck %s -check-prefix=GENERIC
+; RUN: llc < %s -march=arm -pre-RA-sched=source | FileCheck %s -check-prefix=EXPANDED
+; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=cortex-a8 -arm-strict-align -pre-RA-sched=source | FileCheck %s -check-prefix=EXPANDED
+; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=UNALIGNED
  
  ; rdar://7113725
+; rdar://12091029
  
  define void @t(i8* nocapture %a, i8* nocapture %b) nounwind {
  entry:
-; GENERIC: t:
-; GENERIC: ldrb [[R2:r[0-9]+]]
-; GENERIC: ldrb [[R3:r[0-9]+]]
-; GENERIC: ldrb [[R12:r[0-9]+]]
-; GENERIC: ldrb [[R1:r[0-9]+]]
-; GENERIC: strb [[R1]]
-; GENERIC: strb [[R12]]
-; GENERIC: strb [[R3]]
-; GENERIC: strb [[R2]]
-
-; DARWIN_V6: t:
-; DARWIN_V6: ldr r1
-; DARWIN_V6: str r1
+; EXPANDED: t:
+; EXPANDED: ldrb [[R2:r[0-9]+]]
+; EXPANDED: ldrb [[R3:r[0-9]+]]
+; EXPANDED: ldrb [[R12:r[0-9]+]]
+; EXPANDED: ldrb [[R1:r[0-9]+]]
+; EXPANDED: strb [[R1]]
+; EXPANDED: strb [[R12]]
+; EXPANDED: strb [[R3]]
+; EXPANDED: strb [[R2]]
+
+; UNALIGNED: t:
+; UNALIGNED: ldr r1
+; UNALIGNED: str r1
  
    %__src1.i = bitcast i8* %b to i32*              ; <i32*> [#uses=1]
    %__dest2.i = bitcast i8* %a to i32*             ; <i32*> [#uses=1]
@@ -27,3 +27,35 @@ entry:
    store i32 %tmp.i, i32* %__dest2.i, align 1
    ret void
  }
+
+define void @hword(double* %a, double* %b) nounwind {
+entry:
+; EXPANDED: hword:
+; EXPANDED-NOT: vld1
+; EXPANDED: ldrh
+; EXPANDED-NOT: vst1
+; EXPANDED: strh
+
+; UNALIGNED: hword:
+; UNALIGNED: vld1.16
+; UNALIGNED: vst1.16
+  %tmp = load double* %a, align 2
+  store double %tmp, double* %b, align 2
+  ret void
+}
+
+define void @byte(double* %a, double* %b) nounwind {
+entry:
+; EXPANDED: byte:
+; EXPANDED-NOT: vld1
+; EXPANDED: ldrb
+; EXPANDED-NOT: vst1
+; EXPANDED: strb
+
+; UNALIGNED: byte:
+; UNALIGNED: vld1.8
+; UNALIGNED: vst1.8
+  %tmp = load double* %a, align 1
+  store double %tmp, double* %b, align 1
+  ret void
+}
author	Evan Cheng <evan.cheng@apple.com>
	Wed, 15 Aug 2012 17:44:53 +0000 (17:44 +0000)
committer	Evan Cheng <evan.cheng@apple.com>
	Wed, 15 Aug 2012 17:44:53 +0000 (17:44 +0000)
lib/Target/ARM/ARMISelLowering.cpp		patch \| blob \| history
lib/Target/ARM/ARMInstrInfo.td		patch \| blob \| history
lib/Target/ARM/ARMInstrNEON.td		patch \| blob \| history
lib/Target/ARM/ARMInstrVFP.td		patch \| blob \| history
test/CodeGen/ARM/unaligned_load_store.ll		patch \| blob \| history