From: Simon Pilgrim Date: Tue, 13 Oct 2015 14:48:54 +0000 (+0000) Subject: [InstCombine][SSE4A] Remove broken INSERTQI range combining optimization X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=70f1fa1bd898b4ad0ef4ba278335dff8ca335b1c;p=oota-llvm.git [InstCombine][SSE4A] Remove broken INSERTQI range combining optimization As discussed in D13348 - the INSERTQI range combining code is wrong in that it confuses the insertion bit index with an extraction bit index. The remaining legal combines are very unlikely (especially once we've converted to shuffles in D13348) so I'm removing the optimization. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@250160 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index f741d1d485b..b81b9358aa1 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1059,6 +1059,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (auto CILength = dyn_cast<ConstantInt>(II->getArgOperand(2))) { if (auto CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3))) { unsigned Index = CIIndex->getZExtValue(); + // From AMD documentation: "a value of zero in the field length is // defined as length of 64". unsigned Length = CILength->equalsInt(0) ? 
64 : CILength->getZExtValue(); @@ -1077,54 +1078,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (Length == 64 && Index == 0) { Value *Vec = II->getArgOperand(1); Value *Undef = UndefValue::get(Vec->getType()); - const uint32_t Mask[] = { 0, 2 }; + const uint32_t Mask[] = {0, 2}; return ReplaceInstUsesWith( CI, Builder->CreateShuffleVector( Vec, Undef, ConstantDataVector::get( II->getContext(), makeArrayRef(Mask)))); - } else if (auto Source = - dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { - if (Source->hasOneUse() && - Source->getArgOperand(1) == II->getArgOperand(1)) { - // If the source of the insert has only one use and it's another - // insert (and they're both inserting from the same vector), try to - // bundle both together. - auto CISourceLength = - dyn_cast<ConstantInt>(Source->getArgOperand(2)); - auto CISourceIndex = - dyn_cast<ConstantInt>(Source->getArgOperand(3)); - if (CISourceIndex && CISourceLength) { - unsigned SourceIndex = CISourceIndex->getZExtValue(); - unsigned SourceLength = CISourceLength->getZExtValue(); - unsigned SourceEnd = SourceIndex + SourceLength; - unsigned NewIndex, NewLength; - bool ShouldReplace = false; - if (Index <= SourceIndex && SourceIndex <= End) { - NewIndex = Index; - NewLength = std::max(End, SourceEnd) - NewIndex; - ShouldReplace = true; - } else if (SourceIndex <= Index && Index <= SourceEnd) { - NewIndex = SourceIndex; - NewLength = std::max(SourceEnd, End) - NewIndex; - ShouldReplace = true; - } - - if (ShouldReplace) { - Constant *ConstantLength = ConstantInt::get( - II->getArgOperand(2)->getType(), NewLength, false); - Constant *ConstantIndex = ConstantInt::get( - II->getArgOperand(3)->getType(), NewIndex, false); - Value *Args[4] = { Source->getArgOperand(0), - II->getArgOperand(1), ConstantLength, - ConstantIndex }; - Module *M = CI.getParent()->getParent()->getParent(); - Value *F = - Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); - return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args)); - } - } - } } } } @@ 
-1220,9 +1179,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // control mask is set, then zero is written in the result byte. // The zero vector is in the right-hand side of the resulting // shufflevector. - + // The value of each index is the least significant 4 bits of the - // shuffle control byte. + // shuffle control byte. Indexes[I] = (Index < 0) ? NumElts : Index & 0xF; } } else if (!isa(V)) diff --git a/test/Transforms/InstCombine/x86-sse4a.ll b/test/Transforms/InstCombine/x86-sse4a.ll index 40404f39b97..92d5e8ad655 100644 --- a/test/Transforms/InstCombine/x86-sse4a.ll +++ b/test/Transforms/InstCombine/x86-sse4a.ll @@ -1,15 +1,5 @@ ; RUN: opt < %s -instcombine -S | FileCheck %s -; We should optimize these two redundant insertqi into one -define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) { -; CHECK-LABEL: @testInsertTwice -; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32) -; CHECK-NEXT: ret <2 x i64> %1 - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32) - %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32) - ret <2 x i64> %2 -} - ; The result of this insert is the second arg, since the top 64 bits of ; the result are undefined, and we copy the bottom 64 bits from the ; second arg @@ -20,81 +10,6 @@ define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) { ret <2 x i64> %1 } -; Test the several types of ranges and ordering that exist for two insertqi -define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) { -; CHECK-LABEL: @testInsertContainedRange -; CHECK: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0) -; CHECK: ret <2 x i64> %1 - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0) - %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16) - ret <2 x i64> %2 -} - 
-define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) { -; CHECK-LABEL: @testInsertContainedRange_2 -; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0) -; CHECK-NEXT: ret <2 x i64> %1 - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16) - %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0) - ret <2 x i64> %2 -} - -define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) { -; CHECK-LABEL: @testInsertOverlappingRange -; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0) -; CHECK-NEXT: ret <2 x i64> %1 - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0) - %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16) - ret <2 x i64> %2 -} - -define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) { -; CHECK-LABEL: @testInsertOverlappingRange_2 -; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0) -; CHECK-NEXT: ret <2 x i64> %1 - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16) - %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0) - ret <2 x i64> %2 -} - -define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) { -; CHECK-LABEL: @testInsertAdjacentRange -; CHECK-NEXT: %1 = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0) -; CHECK-NEXT: ret <2 x i64> %1 - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0) - %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32) - ret <2 x i64> %2 -} - -define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) { -; CHECK-LABEL: @testInsertAdjacentRange_2 -; CHECK-NEXT: %1 = call <2 
x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0) -; CHECK-NEXT: ret <2 x i64> %1 - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32) - %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0) - ret <2 x i64> %2 -} - -define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) { -; CHECK-LABEL: @testInsertDisjointRange -; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0) -; CHECK-NEXT: %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32) -; CHECK-NEXT: ret <2 x i64> %2 - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0) - %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32) - ret <2 x i64> %2 -} - -define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) { -; CHECK-LABEL: @testInsertDisjointRange_2 -; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0) -; CHECK-NEXT: %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32) -; CHECK-NEXT: ret <2 x i64> %2 - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0) - %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32) - ret <2 x i64> %2 -} - define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i) { ; CHECK-LABEL: @testZeroLength ; CHECK-NEXT: ret <2 x i64> %i