/// the operands which explicitly discard the lanes which are unused by this
/// operation to try to flow through the rest of the combiner the fact that
/// they're unused.
-static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
+ if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
+ (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
+ return SDValue();
// We only handle target-independent shuffles.
// FIXME: It would be easy and harmless to use the target shuffle mask
isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
return SDValue();
- // Only specific types are legal at this point, assert so we notice if and
- // when these change.
- assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
- VT == MVT::v4f64) &&
- "Unknown vector type encountered!");
-
return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
}
// If we have legalized the vector types, look for blends of FADD and FSUB
// nodes that we can fuse into an ADDSUB node.
- if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
- if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
+ if (TLI.isTypeLegal(VT))
+ if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
return AddSub;
// Combine 256-bit vector shuffles. This is only profitable when in AVX mode
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse3 | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
; Test ADDSUB ISel patterns.
ret <2 x double> %vecinit2
}
+; 512-bit <16 x float> case: even result lanes come from the fsub, odd lanes
+; from the fadd. The checks expect one addsubps per 128-bit piece with SSE3,
+; one vaddsubps per 256-bit piece with AVX1, and no ADDSUB node at all with
+; AVX512F (separate vaddps/vsubps merged by a vpermt2ps).
+define <16 x float> @test5(<16 x float> %A, <16 x float> %B) {
+; SSE-LABEL: test5:
+; SSE: # BB#0:
+; SSE-NEXT: addsubps %xmm4, %xmm0
+; SSE-NEXT: addsubps %xmm5, %xmm1
+; SSE-NEXT: addsubps %xmm6, %xmm2
+; SSE-NEXT: addsubps %xmm7, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test5:
+; AVX1: # BB#0:
+; AVX1-NEXT: vaddsubps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vaddsubps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: test5:
+; AVX512: # BB#0:
+; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm2
+; AVX512-NEXT: vsubps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,17,2,19,4,21,6,23,8,25,10,27,12,29,14,31]
+; AVX512-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %add = fadd <16 x float> %A, %B
+ %sub = fsub <16 x float> %A, %B
+ %vecinit2 = shufflevector <16 x float> %sub, <16 x float> %add, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+ ret <16 x float> %vecinit2
+}
+
+; 512-bit <8 x double> case: even result lanes come from the fsub, odd lanes
+; from the fadd. The checks expect one addsubpd per 128-bit piece with SSE3,
+; one vaddsubpd per 256-bit piece with AVX1, and no ADDSUB node at all with
+; AVX512F (separate vaddpd/vsubpd merged by a vpermt2pd).
+define <8 x double> @test6(<8 x double> %A, <8 x double> %B) {
+; SSE-LABEL: test6:
+; SSE: # BB#0:
+; SSE-NEXT: addsubpd %xmm4, %xmm0
+; SSE-NEXT: addsubpd %xmm5, %xmm1
+; SSE-NEXT: addsubpd %xmm6, %xmm2
+; SSE-NEXT: addsubpd %xmm7, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test6:
+; AVX1: # BB#0:
+; AVX1-NEXT: vaddsubpd %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vaddsubpd %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: test6:
+; AVX512: # BB#0:
+; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm2
+; AVX512-NEXT: vsubpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,9,2,11,4,13,6,15]
+; AVX512-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %add = fadd <8 x double> %A, %B
+ %sub = fsub <8 x double> %A, %B
+ %vecinit2 = shufflevector <8 x double> %sub, <8 x double> %add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ ret <8 x double> %vecinit2
+}
+
define <4 x float> @test1b(<4 x float> %A, <4 x float>* %B) {
; SSE-LABEL: test1b:
; SSE: # BB#0: