From: Nadav Rotem
Date: Sun, 1 Apr 2012 19:31:22 +0000 (+0000)
Subject: This commit contains a few changes that had to go in together.
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=4ac9081c7110dbf099d682fa51c091741e763491;p=oota-llvm.git

This commit contains a few changes that had to go in together.

1. Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
   (and also scalar_to_vector).

2. Xor/and/or are indifferent to the swizzle operation (shuffle of one src).
   Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A, B))

3. Optimize swizzles of shuffles: shuff(shuff(x, y), undef) -> shuff(x, y).

4. Fix an X86ISelLowering optimization which was very bitcast-sensitive.

Code which was previously compiled to this:

  movd    (%rsi), %xmm0
  movdqa  .LCPI0_0(%rip), %xmm2
  pshufb  %xmm2, %xmm0
  movd    (%rdi), %xmm1
  pshufb  %xmm2, %xmm1
  pxor    %xmm0, %xmm1
  pshufb  .LCPI0_1(%rip), %xmm1
  movd    %xmm1, (%rdi)
  ret

Now compiles to this:

  movl    (%rsi), %eax
  xorl    %eax, (%rdi)
  ret

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@153848 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index bac644a42a8..5e88fcbb0e1 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2336,6 +2336,68 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
                        ORNode, N0.getOperand(1));
   }

+  // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
+  // Only perform this optimization after type legalization and before
+  // LegalizeVectorOps. LegalizeVectorOps promotes vector operations by
+  // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and
+  // we don't want to undo this promotion.
+  // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
+  // on scalars.
+  if ((N0.getOpcode() == ISD::BITCAST || N0.getOpcode() == ISD::SCALAR_TO_VECTOR)
+      && Level == AfterLegalizeVectorOps) {
+    SDValue In0 = N0.getOperand(0);
+    SDValue In1 = N1.getOperand(0);
+    EVT In0Ty = In0.getValueType();
+    EVT In1Ty = In1.getValueType();
+    // If both incoming values are integers and the original types are the
+    // same, perform the operation on the original type and bitcast the result.
+    if (In0Ty.isInteger() && In1Ty.isInteger() && In0Ty == In1Ty) {
+      SDValue Op = DAG.getNode(N->getOpcode(), N->getDebugLoc(), In0Ty, In0, In1);
+      SDValue BC = DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT, Op);
+      AddToWorkList(Op.getNode());
+      return BC;
+    }
+  }
+
+  // Xor/and/or are indifferent to the swizzle operation (shuffle of one value).
+  // Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B))
+  // If both shuffles use the same mask, and both shuffle within a single
+  // vector, then it is worthwhile to move the swizzle after the operation.
+  // The type-legalizer generates this pattern when loading illegal
+  // vector types from memory. In many cases this allows additional shuffle
+  // optimizations.
+  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) {
+    ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(N0);
+    ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(N1);
+    SDValue In0 = SVN0->getOperand(0);
+    SDValue In1 = SVN1->getOperand(0);
+    EVT In0Ty = In0.getValueType();
+    EVT In1Ty = In1.getValueType();
+
+    unsigned NumElts = VT.getVectorNumElements();
+    // Check that both shuffles are swizzles.
+    bool SingleVecShuff = (N0.getOperand(1).getOpcode() == ISD::UNDEF &&
+                           N1.getOperand(1).getOpcode() == ISD::UNDEF);
+
+    // Check that both shuffles use the same mask. The masks are known to be of
+    // the same length because the result vector type is the same.
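+    // For example (mask <1,1,3,3> chosen purely for illustration):
+    //   xor (shuff(A, undef, <1,1,3,3>), shuff(B, undef, <1,1,3,3>))
+    //     -> shuff(xor(A, B), undef, <1,1,3,3>)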
+    bool SameMask = true;
+    for (unsigned i = 0; i != NumElts; ++i) {
+      int Idx0 = SVN0->getMaskElt(i);
+      int Idx1 = SVN1->getMaskElt(i);
+      if (Idx0 != Idx1) {
+        SameMask = false;
+        break;
+      }
+    }
+
+    if (SameMask && SingleVecShuff && In0Ty == In1Ty) {
+      SDValue Op = DAG.getNode(N->getOpcode(), N->getDebugLoc(), VT, In0, In1);
+      SDValue Shuff = DAG.getVectorShuffle(VT, N->getDebugLoc(), Op,
+                                           DAG.getUNDEF(VT), &SVN0->getMask()[0]);
+      AddToWorkList(Op.getNode());
+      return Shuff;
+    }
+  }

   return SDValue();
 }
@@ -7721,6 +7783,36 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
       return N0;
     }
   }
+
+  // If this shuffle node is simply a swizzle of another shuffle node,
+  // optimize shuffle(shuffle(x, y), undef) -> shuffle(x, y).
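+  // For example (4-element vectors, mask values chosen for illustration):
+  //   shuffle(shuffle(x, y, <0,4,1,5>), undef, <2,3,0,1>)
+  //     -> shuffle(x, y, <1,5,0,4>)
+  // Indices that would select the undef operand become undef (-1) entries.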
+  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
+      N1.getOpcode() == ISD::UNDEF) {
+
+    SmallVector<int, 4> NewMask;
+    ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);
+
+    EVT InVT = N0.getValueType();
+    int InNumElts = InVT.getVectorNumElements();
+
+    for (unsigned i = 0; i != NumElts; ++i) {
+      int Idx = SVN->getMaskElt(i);
+      // If we access the second (undef) operand then this index can be
+      // canonicalized to undef as well.
+      if (Idx >= InNumElts)
+        Idx = -1;
+      // Next, this index comes from the first value, which is the incoming
+      // shuffle. Adopt the incoming index.
+      if (Idx >= 0)
+        Idx = OtherSV->getMaskElt(Idx);
+
+      NewMask.push_back(Idx);
+    }
+
+    return DAG.getVectorShuffle(VT, N->getDebugLoc(), OtherSV->getOperand(0),
+                                OtherSV->getOperand(1), &NewMask[0]);
+  }
+
   return SDValue();
 }
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 88f38298db0..69a60361314 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -14000,13 +14000,14 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();

   // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
-  if (Mask.getOpcode() != ISD::BITCAST ||
-      X.getOpcode() != ISD::BITCAST ||
-      Y.getOpcode() != ISD::BITCAST)
-    return SDValue();
-
   // Look through mask bitcast.
-  Mask = Mask.getOperand(0);
+  if (Mask.getOpcode() == ISD::BITCAST)
+    Mask = Mask.getOperand(0);
+  if (X.getOpcode() == ISD::BITCAST)
+    X = X.getOperand(0);
+  if (Y.getOpcode() == ISD::BITCAST)
+    Y = Y.getOperand(0);
+
   EVT MaskVT = Mask.getValueType();

   // Validate that the Mask operand is a vector sra node.
@@ -14027,8 +14028,6 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
   // Now we know we at least have a plendvb with the mask val.  See if
   // we can form a psignb/w/d.
   // psign = x.type == y.type == mask.type && y = sub(0, x);
-  X = X.getOperand(0);
-  Y = Y.getOperand(0);
   if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
       ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
       X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
diff --git a/test/CodeGen/ARM/reg_sequence.ll b/test/CodeGen/ARM/reg_sequence.ll
index 78b4e7ea84c..05794e4ebdd 100644
--- a/test/CodeGen/ARM/reg_sequence.ll
+++ b/test/CodeGen/ARM/reg_sequence.ll
@@ -273,7 +273,7 @@ define arm_aapcs_vfpcc i32 @t10() nounwind {
 entry:
 ; CHECK: t10:
 ; CHECK: vmov.i32 q[[Q0:[0-9]+]], #0x3f000000
-; CHECK: vmul.f32 q8, q8, d0[0]
+; CHECK: vmul.f32 q8, q8, d[[DREG:[0-1]+]]
 ; CHECK: vadd.f32 q8, q8, q8
 %0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
 %1 = insertelement <4 x float> %0, float undef, i32 1 ; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/CellSPU/rotate_ops.ll b/test/CodeGen/CellSPU/rotate_ops.ll
index 97709352760..8b7af20b4a9 100644
--- a/test/CodeGen/CellSPU/rotate_ops.ll
+++ b/test/CodeGen/CellSPU/rotate_ops.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=cellspu -o %t1.s
-; RUN: grep rot %t1.s | count 86
+; RUN: grep rot %t1.s | count 85
 ; RUN: grep roth %t1.s | count 8
 ; RUN: grep roti.*5 %t1.s | count 1
 ; RUN: grep roti.*27 %t1.s | count 1
diff --git a/test/CodeGen/X86/2011-10-27-tstore.ll b/test/CodeGen/X86/2011-10-27-tstore.ll
index 016e02c3d5d..1712f345653 100644
--- a/test/CodeGen/X86/2011-10-27-tstore.ll
+++ b/test/CodeGen/X86/2011-10-27-tstore.ll
@@ -3,14 +3,14 @@ target triple = "x86_64-unknown-linux-gnu"

 ;CHECK: ltstore
-;CHECK: pshufd
-;CHECK: pshufd
-;CHECK: ret
-define void @ltstore() {
+;CHECK: movq
+;CHECK-NEXT: movq
+;CHECK-NEXT: ret
+define void @ltstore(<4 x i32>* %pIn, <2 x i32>* %pOut) {
 entry:
-  %in = load <4 x i32>* undef
+  %in = load <4 x i32>* %pIn
   %j = shufflevector <4 x i32> %in, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
-  store <2 x i32> %j, <2 x i32>* undef
+  store <2 x i32> %j, <2 x i32>* %pOut
   ret void
 }
diff --git a/test/CodeGen/X86/SwizzleShuff.ll b/test/CodeGen/X86/SwizzleShuff.ll
new file mode 100644
index 00000000000..11b702e3d1b
--- /dev/null
+++ b/test/CodeGen/X86/SwizzleShuff.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+; Check that we perform a scalar XOR on i32.
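+; Previously the <4 x i8> xor was lowered with vector shuffles and a pxor;
+; pulling the xor through the bitcasts turns it into one 32-bit scalar xorl.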
+
+; CHECK: pull_bitcast
+; CHECK: xorl
+; CHECK: ret
+define void @pull_bitcast (<4 x i8>* %pA, <4 x i8>* %pB) {
+  %A = load <4 x i8>* %pA
+  %B = load <4 x i8>* %pB
+  %C = xor <4 x i8> %A, %B
+  store <4 x i8> %C, <4 x i8>* %pA
+  ret void
+}
diff --git a/test/CodeGen/X86/vec_shuffle-37.ll b/test/CodeGen/X86/vec_shuffle-37.ll
index 06083989382..65486cb80c9 100644
--- a/test/CodeGen/X86/vec_shuffle-37.ll
+++ b/test/CodeGen/X86/vec_shuffle-37.ll
@@ -27,11 +27,11 @@ entry:
 define void @t02(<8 x i32>* %source, <2 x i32>* %dest) nounwind noinline {
 entry:
 ; CHECK: t02
-; CHECK: movaps
-; CHECK: shufps
-; CHECK: pshufd
-; CHECK: movq
-; CHECK: ret
+; CHECK: mov
+; CHECK-NEXT: mov
+; CHECK-NEXT: mov
+; CHECK-NEXT: mov
+; CHECK-NEXT: ret
 %0 = bitcast <8 x i32>* %source to <4 x i32>*
 %arrayidx = getelementptr inbounds <4 x i32>* %0, i64 3
 %tmp2 = load <4 x i32>* %arrayidx, align 16
diff --git a/test/CodeGen/X86/widen_shuffle-1.ll b/test/CodeGen/X86/widen_shuffle-1.ll
index 7bebb274f6e..94200537168 100644
--- a/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/test/CodeGen/X86/widen_shuffle-1.ll
@@ -33,7 +33,7 @@ entry:
 define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) nounwind {
 entry:
 ; CHECK: shuf3:
-; CHECK: shufps
+; CHECK: shufd
 %shuffle.i.i.i12 = shufflevector <4 x float> %tmp10, <4 x float> %vecinit15, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
 %tmp1.i.i = shufflevector <3 x float> %tmp25.i.i, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>