From: Sanjay Patel
Date: Fri, 14 Aug 2015 17:53:40 +0000 (+0000)
Subject: [x86] fix allowsMisalignedMemoryAccesses() implementation
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=09ec2d3ce34ef1b1e1683b324428fd156542601d;p=oota-llvm.git

[x86] fix allowsMisalignedMemoryAccesses() implementation

This patch fixes the x86 implementation of allowsMisalignedMemoryAccesses()
to correctly return the 'Fast' output parameter for 32-byte accesses. To test
that, an existing load-merging optimization is changed to use the TLI hook.
This exposes a shortcoming in the current logic and results in the regression
test update. Changing other direct users of the isUnalignedMem32Slow() x86
CPU attribute would be a follow-on patch.

Without the fix in allowsMisalignedMemoryAccesses(), we will loop infinitely
when targeting SandyBridge because LowerINSERT_SUBVECTOR() creates 32-byte
loads from two 16-byte loads while PerformLOADCombine() splits them back into
16-byte loads.

Differential Revision: http://reviews.llvm.org/D10662

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@245075 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9a790010f8c..68a15e021cd 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1915,8 +1915,14 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                    unsigned,
                                                    unsigned,
                                                    bool *Fast) const {
-  if (Fast)
-    *Fast = Subtarget->isUnalignedMemAccessFast();
+  if (Fast) {
+    // FIXME: We should be checking 128-bit accesses separately from smaller
+    // accesses.
+    if (VT.getSizeInBits() == 256)
+      *Fast = !Subtarget->isUnalignedMem32Slow();
+    else
+      *Fast = Subtarget->isUnalignedMemAccessFast();
+  }
   return true;
 }
 
@@ -11259,14 +11265,25 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
   // --> load32 addr
   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
-      OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
-      !Subtarget->isUnalignedMem32Slow()) {
-    SDValue SubVec2 = Vec.getOperand(1);
-    if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
-      if (Idx2->getZExtValue() == 0) {
-        SDValue Ops[] = { SubVec2, SubVec };
-        if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
-          return Ld;
+      OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
+    auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
+    if (Idx2 && Idx2->getZExtValue() == 0) {
+      SDValue SubVec2 = Vec.getOperand(1);
+      // If needed, look through a bitcast to get to the load.
+      if (SubVec2.getNode() && SubVec2.getOpcode() == ISD::BITCAST)
+        SubVec2 = SubVec2.getOperand(0);
+
+      if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) {
+        bool Fast;
+        unsigned Alignment = FirstLd->getAlignment();
+        unsigned AS = FirstLd->getAddressSpace();
+        const X86TargetLowering *TLI = Subtarget->getTargetLowering();
+        if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+                                    OpVT, AS, Alignment, &Fast) && Fast) {
+          SDValue Ops[] = { SubVec2, SubVec };
+          if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
+            return Ld;
+        }
       }
     }
   }
diff --git a/test/CodeGen/X86/unaligned-32-byte-memops.ll b/test/CodeGen/X86/unaligned-32-byte-memops.ll
index d979c16f4ab..608c6e72fbe 100644
--- a/test/CodeGen/X86/unaligned-32-byte-memops.ll
+++ b/test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -75,12 +75,12 @@ define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
   ret <8 x float> %v3
 }
 
+; If the first load is 32-byte aligned, then the loads should be merged in all cases.
+
 define <8 x float> @combine_16_byte_loads_aligned(<4 x float>* %ptr) {
-;; FIXME: The first load is 32-byte aligned, so the second load should get merged.
 ; AVXSLOW-LABEL: combine_16_byte_loads_aligned:
 ; AVXSLOW:       # BB#0:
-; AVXSLOW-NEXT:    vmovaps 48(%rdi), %xmm0
-; AVXSLOW-NEXT:    vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
+; AVXSLOW-NEXT:    vmovaps 48(%rdi), %ymm0
 ; AVXSLOW-NEXT:    retq
 ;
 ; AVXFAST-LABEL: combine_16_byte_loads_aligned:
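
For reference, a rough sketch (not part of the patch) of the kind of IR that the
combine_16_byte_loads_aligned test exercises: two adjacent 16-byte loads, the first
annotated as 32-byte aligned, concatenated with a shufflevector. The function name,
pointer offsets, and second-load alignment below are illustrative approximations
rather than copies of the test file; element offsets 3 and 4 of <4 x float>
correspond to the 48(%rdi) and 64(%rdi) addresses seen in the checks above.

define <8 x float> @merged_load_sketch(<4 x float>* %ptr) {
  ; Two adjacent 16-byte loads; the first carries a 32-byte alignment.
  %p1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
  %p2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
  %lo = load <4 x float>, <4 x float>* %p1, align 32
  %hi = load <4 x float>, <4 x float>* %p2, align 16
  ; Concatenate the halves into an <8 x float>; with this patch, the aligned
  ; pair should be merged into a single 32-byte ymm load on both the AVXSLOW
  ; and AVXFAST run lines.
  %v = shufflevector <4 x float> %lo, <4 x float> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %v
}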