Revise alignment checking/calculation on 256-bit unaligned memory access

author Michael Liao <michael.liao@intel.com>

Mon, 25 Mar 2013 23:50:10 +0000 (23:50 +0000)

committer Michael Liao <michael.liao@intel.com>

Mon, 25 Mar 2013 23:50:10 +0000 (23:50 +0000)
author Michael Liao <michael.liao@intel.com>
Mon, 25 Mar 2013 23:50:10 +0000 (23:50 +0000)
committer Michael Liao <michael.liao@intel.com>
Mon, 25 Mar 2013 23:50:10 +0000 (23:50 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 23cfd6d72f29da72b68afaee16d27ca5754c8e7f..fef2b9659b3c60d88172974b65aa4e5587ba4399 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16639,11 +16639,10 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    unsigned RegSz = RegVT.getSizeInBits();
  
+  // On Sandybridge unaligned 256bit loads are inefficient.
    ISD::LoadExtType Ext = Ld->getExtensionType();
    unsigned Alignment = Ld->getAlignment();
-  bool IsAligned = Alignment == 0 || Alignment == MemVT.getSizeInBits()/8;
-
-  // On Sandybridge unaligned 256bit loads are inefficient.
+  bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
    if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
        !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
      unsigned NumElems = RegVT.getVectorNumElements();
@@ -16663,7 +16662,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
      SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
                                  Ld->getPointerInfo(), Ld->isVolatile(),
                                  Ld->isNonTemporal(), Ld->isInvariant(),
-                                std::max(Alignment/2U, 1U));
+                                std::min(16U, Alignment));
      SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                               Load1.getValue(1),
                               Load2.getValue(1));
@@ -16834,13 +16833,13 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
    DebugLoc dl = St->getDebugLoc();
    SDValue StoredVal = St->getOperand(1);
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  unsigned Alignment = St->getAlignment();
-  bool IsAligned = Alignment == 0 || Alignment == VT.getSizeInBits()/8;
  
    // If we are saving a concatenation of two XMM registers, perform two stores.
    // On Sandy Bridge, 256-bit memory operations are executed by two
    // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
    // memory  operation.
+  unsigned Alignment = St->getAlignment();
+  bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
    if (VT.is256BitVector() && !Subtarget->hasInt256() &&
        StVT == VT && !IsAligned) {
      unsigned NumElems = VT.getVectorNumElements();
@@ -16860,7 +16859,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
      SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
                                  St->getPointerInfo(), St->isVolatile(),
                                  St->isNonTemporal(),
-                                std::max(Alignment/2U, 1U));
+                                std::min(16U, Alignment));
      return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
    }
  
diff --git a/test/CodeGen/X86/avx-load-store.ll b/test/CodeGen/X86/avx-load-store.ll

index 0afaff830df09f1f33a6310dbfdfcb540dd63cc4..a6775aba09893522b34948682d7d94520c13655d 100644 (file)
--- a/test/CodeGen/X86/avx-load-store.ll
+++ b/test/CodeGen/X86/avx-load-store.ll
@@ -81,7 +81,7 @@ define void @storev32i8_01(<32 x i8> %a) nounwind {
  ; CHECK: _double_save
  ; CHECK-NOT: vinsertf128 $1
  ; CHECK-NOT: vinsertf128 $0
-; CHECK: vmovups %xmm
+; CHECK: vmovaps %xmm
  ; CHECK: vmovaps %xmm
  define void @double_save(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind ssp {
  entry:
@@ -127,3 +127,25 @@ define void @add8i32(<8 x i32>* %ret, <8 x i32>* %bp) nounwind {
    store <8 x i32> %x, <8 x i32>* %ret, align 1
    ret void
  }
+
+; CHECK: add4i64a64
+; CHECK: vmovaps ({{.*}}), %ymm{{.*}}
+; CHECK: vmovaps %ymm{{.*}}, ({{.*}})
+define void @add4i64a64(<4 x i64>* %ret, <4 x i64>* %bp) nounwind {
+  %b = load <4 x i64>* %bp, align 64
+  %x = add <4 x i64> zeroinitializer, %b
+  store <4 x i64> %x, <4 x i64>* %ret, align 64
+  ret void
+}
+
+; CHECK: add4i64a16
+; CHECK: vmovaps {{.*}}({{.*}}), %xmm{{.*}}
+; CHECK: vmovaps {{.*}}({{.*}}), %xmm{{.*}}
+; CHECK: vmovaps %xmm{{.*}}, {{.*}}({{.*}})
+; CHECK: vmovaps %xmm{{.*}}, {{.*}}({{.*}})
+define void @add4i64a16(<4 x i64>* %ret, <4 x i64>* %bp) nounwind {
+  %b = load <4 x i64>* %bp, align 16
+  %x = add <4 x i64> zeroinitializer, %b
+  store <4 x i64> %x, <4 x i64>* %ret, align 16
+  ret void
+}
author	Michael Liao <michael.liao@intel.com>
	Mon, 25 Mar 2013 23:50:10 +0000 (23:50 +0000)
committer	Michael Liao <michael.liao@intel.com>
	Mon, 25 Mar 2013 23:50:10 +0000 (23:50 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch | blob | history
test/CodeGen/X86/avx-load-store.ll		patch | blob | history