From: Luke Cheeseman
Date: Fri, 24 Jul 2015 09:57:05 +0000 (+0000)
Subject: [ARM] - Fix lowering of shufflevectors in AArch32
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=67b17f9ff89dcaea75802bf0d1186f4e6f189602;p=oota-llvm.git

[ARM] - Fix lowering of shufflevectors in AArch32

Some shufflevectors are currently being incorrectly lowered in the AArch32
backend: the existing checks that detect the NEON operations from the
shufflevector instruction expect the shuffle mask and the vector operands to
be of the same length. This is not always the case, as the mask may be twice
as long as the operands. In that case only the lower half of the shuffle mask
gets checked, so as long as the lower half looks like a vector transpose (or
is simply all -1, i.e. undef), the shuffle may get incorrectly lowered into a
vector transpose (VTRN) instruction.

This patch fixes the issue by accommodating both mask lengths and adds
regression tests.

Differential Revision: http://reviews.llvm.org/D11407

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@243103 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index d77b722e62a..afc6ee8ad0e 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -5043,18 +5043,50 @@ static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
   return VT == MVT::v8i8 && M.size() == 8;
 }
 
+// Checks whether the shuffle mask represents a vector transpose (VTRN) by
+// checking that pairs of elements in the shuffle mask represent the same index
+// in each vector, incrementing the expected index by 2 at each step.
+// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
+//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
+//  v2={e,f,g,h}
+// WhichResult gives the offset for each element in the mask based on which
+// of the two results it belongs to.
+//
+// The transpose can be represented either as:
+// result1 = shufflevector v1, v2, result1_shuffle_mask
+// result2 = shufflevector v1, v2, result2_shuffle_mask
+// where v1/v2 and the shuffle masks have the same number of elements
+// (here WhichResult indicates which result is being checked)
+//
+// or as:
+// results = shufflevector v1, v2, shuffle_mask
+// where both results are returned in one vector and the shuffle mask has twice
+// as many elements as v1/v2 (here WhichResult will always be 0 if the mask is
+// valid). In this case we check the low half and the high half of the shuffle
+// mask as if each were a mask for the former representation.
 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
   if (EltSz == 64)
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  for (unsigned i = 0; i < NumElts; i += 2) {
-    if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) ||
-        (M[i+1] >= 0 && (unsigned) M[i+1] != i + NumElts + WhichResult))
-      return false;
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  // If the mask is twice as long as the result then we need to check the
+  // upper and lower parts of the mask separately.
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    for (unsigned j = 0; j < NumElts; j += 2) {
+      if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
+          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
+        return false;
+    }
   }
+
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
   return true;
 }
 
@@ -5067,28 +5099,52 @@ static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  for (unsigned i = 0; i < NumElts; i += 2) {
-    if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) ||
-        (M[i+1] >= 0 && (unsigned) M[i+1] != i + WhichResult))
-      return false;
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    for (unsigned j = 0; j < NumElts; j += 2) {
+      if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
+          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
+        return false;
+    }
   }
+
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
   return true;
 }
 
+// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
+// that the mask elements are either all even and in steps of size 2 or all odd
+// and in steps of size 2.
+// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
+//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
+//  v2={e,f,g,h}
+// Requires checks similar to those of isVTRNMask with respect to how the
+// results are returned.
 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
   if (EltSz == 64)
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  for (unsigned i = 0; i != NumElts; ++i) {
-    if (M[i] < 0) continue; // ignore UNDEF indices
-    if ((unsigned) M[i] != 2 * i + WhichResult)
-      return false;
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    for (unsigned j = 0; j < NumElts; ++j) {
+      if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
+        return false;
+    }
   }
 
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
   if (VT.is64BitVector() && EltSz == 32)
     return false;
@@ -5104,18 +5160,27 @@ static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
   if (EltSz == 64)
     return false;
 
-  unsigned Half = VT.getVectorNumElements() / 2;
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  for (unsigned j = 0; j != 2; ++j) {
-    unsigned Idx = WhichResult;
-    for (unsigned i = 0; i != Half; ++i) {
-      int MIdx = M[i + j * Half];
-      if (MIdx >= 0 && (unsigned) MIdx != Idx)
-        return false;
-      Idx += 2;
+  unsigned NumElts = VT.getVectorNumElements();
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  unsigned Half = NumElts / 2;
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    for (unsigned j = 0; j < NumElts; j += Half) {
+      unsigned Idx = WhichResult;
+      for (unsigned k = 0; k < Half; ++k) {
+        int MIdx = M[i + j + k];
+        if (MIdx >= 0 && (unsigned) MIdx != Idx)
+          return false;
+        Idx += 2;
+      }
     }
   }
 
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
   if (VT.is64BitVector() && EltSz == 32)
     return false;
@@ -5123,21 +5188,37 @@ static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
   return true;
 }
 
+// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
+// that pairs of elements of the shuffle mask represent the same index in each
+// vector, incrementing sequentially through the vectors.
+// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
+//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
+//  v2={e,f,g,h}
+// Requires checks similar to those of isVTRNMask with respect to how the
+// results are returned.
 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
   if (EltSz == 64)
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  unsigned Idx = WhichResult * NumElts / 2;
-  for (unsigned i = 0; i != NumElts; i += 2) {
-    if ((M[i] >= 0 && (unsigned) M[i] != Idx) ||
-        (M[i+1] >= 0 && (unsigned) M[i+1] != Idx + NumElts))
-      return false;
-    Idx += 1;
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    unsigned Idx = WhichResult * NumElts / 2;
+    for (unsigned j = 0; j < NumElts; j += 2) {
+      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
+          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
+        return false;
+      Idx += 1;
+    }
   }
 
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
   if (VT.is64BitVector() && EltSz == 32)
     return false;
@@ -5154,15 +5235,23 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  unsigned Idx = WhichResult * NumElts / 2;
-  for (unsigned i = 0; i != NumElts; i += 2) {
-    if ((M[i] >= 0 && (unsigned) M[i] != Idx) ||
-        (M[i+1] >= 0 && (unsigned) M[i+1] != Idx))
-      return false;
-    Idx += 1;
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    unsigned Idx = WhichResult * NumElts / 2;
+    for (unsigned j = 0; j < NumElts; j += 2) {
+      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
+          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
+        return false;
+      Idx += 1;
+    }
   }
 
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
   if (VT.is64BitVector() && EltSz == 32)
     return false;
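For readers who want to poke at the mask logic outside the LLVM tree, here is a
minimal standalone sketch of the block-wise check introduced above (plain C++;
the name isTransposeMask and the sample masks are illustrative, not part of the
patch). The mask is walked in operand-sized blocks, each block is validated as
a single-result transpose mask in its own right, and WhichResult is forced to 0
when both results are packed into one vector:

#include <cassert>
#include <cstddef>
#include <vector>

// Standalone sketch of the doubled-mask VTRN check. A mask entry of -1 means
// "undef" and matches anything, mirroring shufflevector semantics.
static bool isTransposeMask(const std::vector<int> &M, unsigned NumElts,
                            unsigned &WhichResult) {
  // The mask must cover either one result or both results concatenated.
  if (M.size() != NumElts && M.size() != NumElts * 2)
    return false;

  // Check each operand-sized block of the mask independently, exactly as if
  // it were a single-result transpose mask.
  for (std::size_t i = 0; i < M.size(); i += NumElts) {
    WhichResult = (M[i] == 0) ? 0 : 1;
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i + j] >= 0 && (unsigned)M[i + j] != j + WhichResult) ||
          (M[i + j + 1] >= 0 &&
           (unsigned)M[i + j + 1] != j + NumElts + WhichResult))
        return false;
    }
  }

  // With a doubled mask both results live in one vector, so there is no
  // single "which result" to report.
  if (M.size() == NumElts * 2)
    WhichResult = 0;
  return true;
}

int main() {
  unsigned Which;
  // Single-result mask for v4i32: [0, 4, 2, 6] is the first VTRN result.
  assert(isTransposeMask({0, 4, 2, 6}, 4, Which) && Which == 0);
  // Doubled mask with an all-undef lower half: each half is still checked,
  // so this is accepted as a transpose...
  assert(isTransposeMask({-1, -1, -1, -1, 1, 5, 3, 7}, 4, Which));
  // ...but a doubled mask whose upper half is not a transpose is rejected,
  // which is exactly the case the patch fixes.
  assert(!isTransposeMask({-1, -1, -1, -1, 0, 2, 4, 6}, 4, Which));
  return 0;
}

The same block-wise scaffolding is what the patch applies to the VUZP and VZIP
checkers as well.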
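To make the worked examples in the new comments concrete, the following sketch
(again plain C++; trnMask, uzpMask and zipMask are hypothetical helpers, not
LLVM APIs) generates the WhichResult-0 single-result masks that each checker
accepts for a v4i32 shuffle. It prints VTRN: 0 4 2 6, VUZP: 0 2 4 6 and
VZIP: 0 4 1 5, matching the comment examples above:

#include <cstdio>
#include <vector>

// Build the WhichResult-0 shuffle masks for the three NEON permutes on two
// NumElts-wide vectors; purely illustrative, not LLVM code.
static std::vector<int> trnMask(unsigned NumElts) {
  std::vector<int> M;
  for (unsigned i = 0; i < NumElts; i += 2) {
    M.push_back(i);            // element i of v1
    M.push_back(i + NumElts);  // element i of v2
  }
  return M;
}

static std::vector<int> uzpMask(unsigned NumElts) {
  std::vector<int> M;
  for (unsigned i = 0; i < NumElts; ++i)
    M.push_back(2 * i);        // every even element of v1:v2
  return M;
}

static std::vector<int> zipMask(unsigned NumElts) {
  std::vector<int> M;
  for (unsigned i = 0; i < NumElts / 2; ++i) {
    M.push_back(i);            // low elements of v1 and v2, interleaved
    M.push_back(i + NumElts);
  }
  return M;
}

static void print(const char *Name, const std::vector<int> &M) {
  std::printf("%s:", Name);
  for (int E : M)
    std::printf(" %d", E);
  std::printf("\n");
}

int main() {
  // For v4i32 this prints: VTRN: 0 4 2 6, VUZP: 0 2 4 6, VZIP: 0 4 1 5
  print("VTRN", trnMask(4));
  print("VUZP", uzpMask(4));
  print("VZIP", zipMask(4));
  return 0;
}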
diff --git a/test/CodeGen/ARM/vext.ll b/test/CodeGen/ARM/vext.ll
index 72ecf0ef062..61342022ad5 100644
--- a/test/CodeGen/ARM/vext.ll
+++ b/test/CodeGen/ARM/vext.ll
@@ -196,3 +196,35 @@ define arm_aapcscc void @test_elem_mismatch(<2 x i64>* nocapture %src, <4 x i16>
   store <4 x i16> %tmp7, <4 x i16>* %dest, align 4
   ret void
 }
+
+define <4 x i32> @test_reverse_and_extract(<2 x i32>* %A) {
+entry:
+  ; CHECK-LABEL: test_reverse_and_extract
+  ; CHECK-NOT: vtrn
+  ; CHECK: vrev
+  ; CHECK: vext
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32>
+  ret <4 x i32> %0
+}
+
+define <4 x i32> @test_dup_and_extract(<2 x i32>* %A) {
+entry:
+  ; CHECK-LABEL: test_dup_and_extract
+  ; CHECK-NOT: vtrn
+  ; CHECK: vdup
+  ; CHECK: vext
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32>
+  ret <4 x i32> %0
+}
+
+define <4 x i32> @test_zip_and_extract(<2 x i32>* %A) {
+entry:
+  ; CHECK-LABEL: test_zip_and_extract
+  ; CHECK: vzip
+  ; CHECK: vext
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32>
+  ret <4 x i32> %0
+}
diff --git a/test/CodeGen/ARM/vtrn.ll b/test/CodeGen/ARM/vtrn.ll
index 7b83dfdaf22..40b50499318 100644
--- a/test/CodeGen/ARM/vtrn.ll
+++ b/test/CodeGen/ARM/vtrn.ll
@@ -325,3 +325,13 @@ define <16 x i16> @vtrnQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
   %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32>
   ret <16 x i16> %tmp3
 }
+
+define <8 x i16> @vtrn_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
+entry:
+  ; CHECK-LABEL: vtrn_lower_shufflemask_undef
+  ; CHECK: vtrn
+  %tmp1 = load <4 x i16>, <4 x i16>* %A
+  %tmp2 = load <4 x i16>, <4 x i16>* %B
+  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32>
+  ret <8 x i16> %0
+}
diff --git a/test/CodeGen/ARM/vuzp.ll b/test/CodeGen/ARM/vuzp.ll
index 5510634b066..8e71ad81264 100644
--- a/test/CodeGen/ARM/vuzp.ll
+++ b/test/CodeGen/ARM/vuzp.ll
@@ -264,3 +264,24 @@ define <16 x i16> @vuzpQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
   %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32>
   ret <16 x i16> %tmp3
 }
+
+define <8 x i16> @vuzp_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
+entry:
+  ; CHECK-LABEL: vuzp_lower_shufflemask_undef
+  ; CHECK: vuzp
+  %tmp1 = load <4 x i16>, <4 x i16>* %A
+  %tmp2 = load <4 x i16>, <4 x i16>* %B
+  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32>
+  ret <8 x i16> %0
+}
+
+define <4 x i32> @vuzp_lower_shufflemask_zeroed(<2 x i32>* %A, <2 x i32>* %B) {
+entry:
+  ; CHECK-LABEL: vuzp_lower_shufflemask_zeroed
+  ; CHECK-NOT: vtrn
+  ; CHECK: vuzp
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %tmp2 = load <2 x i32>, <2 x i32>* %B
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32>
+  ret <4 x i32> %0
+}
diff --git a/test/CodeGen/ARM/vzip.ll b/test/CodeGen/ARM/vzip.ll
index 1d9f59aeda0..e7b7da333f4 100644
--- a/test/CodeGen/ARM/vzip.ll
+++ b/test/CodeGen/ARM/vzip.ll
@@ -264,3 +264,34 @@ define <32 x i8> @vzipQi8_undef_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind {
   %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32>
   ret <32 x i8> %tmp3
 }
+
+define <8 x i16> @vzip_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
+entry:
+  ; CHECK-LABEL: vzip_lower_shufflemask_undef
+  ; CHECK: vzip
+  %tmp1 = load <4 x i16>, <4 x i16>* %A
+  %tmp2 = load <4 x i16>, <4 x i16>* %B
+  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32>
+  ret <8 x i16> %0
+}
+
+define <4 x i32> @vzip_lower_shufflemask_zeroed(<2 x i32>* %A) {
+entry:
+  ; CHECK-LABEL: vzip_lower_shufflemask_zeroed
+  ; CHECK-NOT: vtrn
+  ; CHECK: vzip
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp1, <4 x i32>
+  ret <4 x i32> %0
+}
+
+define <4 x i32> @vzip_lower_shufflemask_vuzp(<2 x i32>* %A) {
+entry:
+  ; CHECK-LABEL: vzip_lower_shufflemask_vuzp
+  ; CHECK-NOT: vuzp
+  ; CHECK: vzip
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp1, <4 x i32>
+  ret <4 x i32> %0
+}
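The tests above that pair CHECK-NOT: vtrn with a doubled shuffle mask target
exactly the pre-patch failure mode. As a final illustration, here is a
standalone sketch of the old check's shape, reconstructed from the removed
lines in the diff (isTransposeMaskOld is an illustrative name, not the LLVM
function): because it only ever reads the first NumElts mask entries, a
doubled mask with an all-undef lower half is reported as a transpose
regardless of what its upper half requests:

#include <cassert>
#include <vector>

// Pre-patch behaviour (sketch): only the first NumElts entries of the mask
// are inspected, even when the mask is twice as long as the operands.
static bool isTransposeMaskOld(const std::vector<int> &M, unsigned NumElts,
                               unsigned &WhichResult) {
  WhichResult = (M[0] == 0) ? 0 : 1;
  for (unsigned i = 0; i < NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
      return false;
  }
  return true;
}

int main() {
  unsigned Which;
  // Doubled mask: lower half all undef, upper half a zip-style pattern
  // ([0, 4, 1, 5]). The old check never reads the upper half and happily
  // reports a transpose, which is the mislowering described above.
  std::vector<int> M = {-1, -1, -1, -1, 0, 4, 1, 5};
  assert(isTransposeMaskOld(M, 4, Which)); // accepted: wrong
  return 0;
}

With the patched block-wise check the same mask is rejected as a VTRN and
picked up by the VZIP checker instead, which is what the new vzip.ll tests
verify.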