From: Bruno Cardoso Lopes Date: Tue, 2 Aug 2011 16:06:18 +0000 (+0000) Subject: Make this kind of lowering to be supported by 256-bit instructions: X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=ac5f13fe3f4b79fa188d407dc97de9c9f8fdf368;p=oota-llvm.git Make this kind of lowering to be supported by 256-bit instructions: shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> To: shuffle (vload ptr)), undef, <1, 1, 1, 1> Fix PR10494 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@136691 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b00b00d047b..34a6dc3719e 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -4566,42 +4566,52 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, return SDValue(); } + // FIXME: 256-bit vector instructions don't require a strict alignment, + // improve this code to support it better. + unsigned RequiredAlign = VT.getSizeInBits()/8; SDValue Chain = LD->getChain(); - // Make sure the stack object alignment is at least 16. + // Make sure the stack object alignment is at least 16 or 32. MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - if (DAG.InferPtrAlignment(Ptr) < 16) { + if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { if (MFI->isFixedObjectIndex(FI)) { // Can't change the alignment. FIXME: It's possible to compute // the exact stack offset and reference FI + adjust offset instead. // If someone *really* cares about this. That's the way to implement it. return SDValue(); } else { - MFI->setObjectAlignment(FI, 16); + MFI->setObjectAlignment(FI, RequiredAlign); } } - // (Offset % 16) must be multiple of 4. Then address is then + // (Offset % 16 or 32) must be multiple of 4. Then address is then // Ptr + (Offset & ~15). if (Offset < 0) return SDValue(); - if ((Offset % 16) & 3) + if ((Offset % RequiredAlign) & 3) return SDValue(); - int64_t StartOffset = Offset & ~15; + int64_t StartOffset = Offset & ~(RequiredAlign-1); if (StartOffset) Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); int EltNo = (Offset - StartOffset) >> 2; - int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; - EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32; - SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr, + int NumElems = VT.getVectorNumElements(); + + EVT CanonVT = VT.getSizeInBits() == 128 ? MVT::v4i32 : MVT::v8i32; + EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); + SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(StartOffset), false, false, 0); - // Canonicalize it to a v4i32 shuffle. - V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); - return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getVectorShuffle(MVT::v4i32, dl, V1, - DAG.getUNDEF(MVT::v4i32),&Mask[0])); + + // Canonicalize it to a v4i32 or v8i32 shuffle. + SmallVector Mask; + for (int i = 0; i < NumElems; ++i) + Mask.push_back(EltNo); + + V1 = DAG.getNode(ISD::BITCAST, dl, CanonVT, V1); + return DAG.getNode(ISD::BITCAST, dl, NVT, + DAG.getVectorShuffle(CanonVT, dl, V1, + DAG.getUNDEF(CanonVT),&Mask[0])); } return SDValue(); diff --git a/test/CodeGen/X86/avx-256-splat.ll b/test/CodeGen/X86/avx-256-splat.ll index edc17b72917..36d469417f9 100644 --- a/test/CodeGen/X86/avx-256-splat.ll +++ b/test/CodeGen/X86/avx-256-splat.ll @@ -45,3 +45,35 @@ entry: %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3 ret <4 x double> %vecinit6.i } + +; Test this simple opt: +; shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> +; To: +; shuffle (vload ptr)), undef, <1, 1, 1, 1> +; CHECK: vmovaps +; CHECK-NEXT: vpextrd +define void @funcE() nounwind { +allocas: + %udx495 = alloca [18 x [18 x float]], align 32 + br label %for_test505.preheader + +for_test505.preheader: ; preds = %for_test505.preheader, %allocas + br i1 undef, label %for_exit499, label %for_test505.preheader + +for_exit499: ; preds = %for_test505.preheader + br i1 undef, label %__load_and_broadcast_32.exit1249, label %load.i1247 + +load.i1247: ; preds = %for_exit499 + %ptr1227 = getelementptr [18 x [18 x float]]* %udx495, i64 0, i64 1, i64 1 + %ptr.i1237 = bitcast float* %ptr1227 to i32* + %val.i1238 = load i32* %ptr.i1237, align 4 + %ret6.i1245 = insertelement <8 x i32> undef, i32 %val.i1238, i32 6 + %ret7.i1246 = insertelement <8 x i32> %ret6.i1245, i32 %val.i1238, i32 7 + %phitmp = bitcast <8 x i32> %ret7.i1246 to <8 x float> + br label %__load_and_broadcast_32.exit1249 + +__load_and_broadcast_32.exit1249: ; preds = %load.i1247, %for_exit499 + %load_broadcast12281250 = phi <8 x float> [ %phitmp, %load.i1247 ], [ undef, %for_exit499 ] + ret void +} +