From 5f79fd2f020fe5f389b73ac8b3ea99d461c1985d Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Thu, 5 Mar 2015 21:46:54 +0000
Subject: [PATCH] [AVX] Lower / fast-isel scalar FP selects into VBLENDV
 instructions (PR22483)

This patch reduces code size for all AVX targets and increases speed for some
chips.

SSE 4.1 introduced the useless (see code comments) 2-register form of BLENDV,
and only in the packed float/double flavors.

AVX subsequently made the instruction useful by adding a 4-register operand
form.

So we just need to paper over the lack of scalar forms of this instruction,
complicate the code to choose float or double forms, and use blendv on scalars
since all FP is in xmm registers anyway.

This gives us an approximately 50% speed up for a blendv microbenchmark
sequence on SandyBridge and Haswell:

blendv : 29.73 cycles/iter
logic  : 43.15 cycles/iter

No new test cases with this patch because:

1. fast-isel-select-sse.ll tests the positive side for regular X86 lowering
   and fast-isel.
2. sse-minmax.ll and fp-select-cmp-and.ll confirm that we're not firing for
   scalar selects without AVX.
3. fp-select-cmp-and.ll and logical-load-fold.ll confirm that we're not
   firing for scalar selects with constants.

http://llvm.org/bugs/show_bug.cgi?id=22483
Differential Revision: http://reviews.llvm.org/D8063

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@231408 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86FastISel.cpp           | 53 ++++++++-----
 lib/Target/X86/X86ISelLowering.cpp       | 40 +++++++++-
 test/CodeGen/X86/fast-isel-select-sse.ll | 96 ++++++------------------
 3 files changed, 96 insertions(+), 93 deletions(-)
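For concreteness, here is the shape of source code this lowering targets,
with the AVX assembly before and after the change; the assembly mirrors the
CHECK lines in fast-isel-select-sse.ll, and the C++ function is only an
illustrative analogue of the IR tests:

  // Compiles to the fcmp oeq + select pattern handled by this patch.
  float select_fcmp_oeq_f32(float a, float b, float c, float d) {
    return (a == b) ? c : d;
  }

  // AVX, before (compare + 3 logic ops):
  //   vcmpeqss %xmm1, %xmm0, %xmm0
  //   vandps   %xmm2, %xmm0, %xmm1
  //   vandnps  %xmm3, %xmm0, %xmm0
  //   vorps    %xmm1, %xmm0, %xmm0
  //
  // AVX, after (compare + 1 variable blend):
  //   vcmpeqss  %xmm1, %xmm0, %xmm0
  //   vblendvps %xmm0, %xmm2, %xmm3, %xmm0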
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 15a4948e843..07b602a1367 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -1810,11 +1810,11 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
   return true;
 }
 
-/// \brief Emit SSE instructions to lower the select.
+/// \brief Emit SSE or AVX instructions to lower the select.
 ///
 /// Try to use SSE1/SSE2 instructions to simulate a select without branches.
 /// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
-/// SSE instructions are available.
+/// SSE instructions are available. If AVX is available, try to use a VBLENDV.
 bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
   // Optimize conditions coming from a compare if both instructions are in the
   // same basic block (values defined in other basic blocks may not have
@@ -1850,19 +1850,17 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
   if (NeedSwap)
     std::swap(CmpLHS, CmpRHS);
 
-  static unsigned OpcTable[2][2][4] = {
-    { { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr },
-      { X86::VCMPSSrr, X86::VFsANDPSrr, X86::VFsANDNPSrr, X86::VFsORPSrr } },
-    { { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr },
-      { X86::VCMPSDrr, X86::VFsANDPDrr, X86::VFsANDNPDrr, X86::VFsORPDrr } }
+  // Choose the SSE instruction sequence based on data type (float or double).
+  static unsigned OpcTable[2][4] = {
+    { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr },
+    { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr }
   };
 
-  bool HasAVX = Subtarget->hasAVX();
   unsigned *Opc = nullptr;
   switch (RetVT.SimpleTy) {
   default: return false;
-  case MVT::f32: Opc = &OpcTable[0][HasAVX][0]; break;
-  case MVT::f64: Opc = &OpcTable[1][HasAVX][0]; break;
+  case MVT::f32: Opc = &OpcTable[0][0]; break;
+  case MVT::f64: Opc = &OpcTable[1][0]; break;
   }
 
   const Value *LHS = I->getOperand(1);
@@ -1884,14 +1882,33 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
     return false;
 
   const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
-  unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
-                                     CmpRHSReg, CmpRHSIsKill, CC);
-  unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false,
-                                    LHSReg, LHSIsKill);
-  unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true,
-                                     RHSReg, RHSIsKill);
-  unsigned ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true,
-                                       AndReg, /*IsKill=*/true);
+  unsigned ResultReg;
+
+  if (Subtarget->hasAVX()) {
+    // If we have AVX, create 1 blendv instead of 3 logic instructions.
+    // Blendv was introduced with SSE 4.1, but the 2 register form implicitly
+    // uses XMM0 as the selection register. That may need just as many
+    // instructions as the AND/ANDN/OR sequence due to register moves, so
+    // don't bother.
+    unsigned CmpOpcode =
+      (RetVT.SimpleTy == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr;
+    unsigned BlendOpcode =
+      (RetVT.SimpleTy == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr;
+
+    unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill,
+                                       CmpRHSReg, CmpRHSIsKill, CC);
+    ResultReg = fastEmitInst_rrr(BlendOpcode, RC, RHSReg, RHSIsKill,
+                                 LHSReg, LHSIsKill, CmpReg, true);
+  } else {
+    unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
+                                       CmpRHSReg, CmpRHSIsKill, CC);
+    unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false,
+                                      LHSReg, LHSIsKill);
+    unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true,
+                                       RHSReg, RHSIsKill);
+    ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true,
+                                AndReg, /*IsKill=*/true);
+  }
   updateValueMap(I, ResultReg);
   return true;
 }
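To see why the AVX requirement matters, compare the two encodings of the same
blend operation. This is a minimal sketch using the real SSE 4.1
_mm_blendv_ps intrinsic; the function name is illustrative, and the register
notes assume GCC/Clang-style -msse4.1 vs. -mavx compilation:

  #include <immintrin.h>

  // Each lane of the result is: mask lane all-ones ? t : f.
  __m128 blend_demo(__m128 a, __m128 b, __m128 t, __m128 f) {
    __m128 mask = _mm_cmpeq_ps(a, b); // all-ones or all-zeros per lane
    return _mm_blendv_ps(f, t, mask);
    // -msse4.1: blendvps with the mask pinned to %xmm0 and the destination
    //           overwritten, which often forces extra register moves.
    // -mavx:    vblendvps with all four operands explicit, so the register
    //           allocator is free to place everything.
  }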
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index dd8a9cee9ad..c558fb3add5 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -13271,9 +13271,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op1.getValueType();
   SDValue CC;
 
-  // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
-  // are available. Otherwise fp cmovs get lowered into a less efficient branch
-  // sequence later on.
+  // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
+  // are available or VBLENDV if AVX is available.
+  // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
   if (Cond.getOpcode() == ISD::SETCC &&
       ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
        (Subtarget->hasSSE1() && VT == MVT::f32)) &&
@@ -13288,8 +13288,42 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
                                 DAG.getConstant(SSECC, MVT::i8));
       return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
     }
+
     SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
                               DAG.getConstant(SSECC, MVT::i8));
+
+    // If we have AVX, we can use a variable vector select (VBLENDV) instead
+    // of 3 logic instructions for size savings and potentially speed.
+    // Unfortunately, there is no scalar form of VBLENDV.
+
+    // If either operand is a constant, don't try this. We can expect to
+    // optimize away at least one of the logic instructions later in that
+    // case, so that sequence would be faster than a variable blend.
+
+    // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
+    // uses XMM0 as the selection register. That may need just as many
+    // instructions as the AND/ANDN/OR sequence due to register moves, so
+    // don't bother.
+
+    if (Subtarget->hasAVX() &&
+        !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
+
+      // Convert to vectors, do a VSELECT, and convert back to scalar.
+      // All of the conversions should be optimized away.
+
+      EVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
+      SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
+      SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
+      SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
+
+      EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
+      VCmp = DAG.getNode(ISD::BITCAST, DL, VCmpVT, VCmp);
+
+      SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
+
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+                         VSel, DAG.getIntPtrConstant(0));
+    }
     SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
     SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
     return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
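The blend and the AND/ANDN/OR fallback are interchangeable here because a
scalar FP compare writes an all-ones or all-zeros bit pattern, so masking
picks exactly one operand, bit for bit. A bit-level model of the non-AVX
sequence, as a hypothetical standalone helper (not taken from the patch
sources):

  #include <cstdint>
  #include <cstring>

  // Models CMPSS + ANDPS + ANDNPS + ORPS on one f32 lane:
  // (mask & c) | (~mask & d) == (cmp ? c : d), bit for bit.
  float logic_select_f32(bool cmp, float c, float d) {
    uint32_t mask = cmp ? 0xFFFFFFFFu : 0u; // CMPSS writes this pattern
    uint32_t ci, di;
    std::memcpy(&ci, &c, sizeof ci);
    std::memcpy(&di, &d, sizeof di);
    uint32_t r = (mask & ci) | (~mask & di); // ANDPS / ANDNPS / ORPS
    float out;
    std::memcpy(&out, &r, sizeof out);
    return out;
  }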
diff --git a/test/CodeGen/X86/fast-isel-select-sse.ll b/test/CodeGen/X86/fast-isel-select-sse.ll
index af11a87d551..6761be6d4e1 100644
--- a/test/CodeGen/X86/fast-isel-select-sse.ll
+++ b/test/CodeGen/X86/fast-isel-select-sse.ll
@@ -13,9 +13,7 @@ define float @select_fcmp_oeq_f32(float %a, float %b, float %c, float %d) {
 ; CHECK-NEXT: orps %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_oeq_f32
 ; AVX: vcmpeqss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp oeq float %a, %b
   %2 = select i1 %1, float %c, float %d
   ret float %2
@@ -29,9 +27,7 @@ define double @select_fcmp_oeq_f64(double %a, double %b, double %c, double %d) {
 ; CHECK-NEXT: orpd %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_oeq_f64
 ; AVX: vcmpeqsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp oeq double %a, %b
   %2 = select i1 %1, double %c, double %d
   ret double %2
@@ -45,9 +41,7 @@ define float @select_fcmp_ogt_f32(float %a, float %b, float %c, float %d) {
 ; CHECK-NEXT: orps %xmm2, %xmm1
 ; AVX-LABEL: select_fcmp_ogt_f32
 ; AVX: vcmpltss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp ogt float %a, %b
   %2 = select i1 %1, float %c, float %d
   ret float %2
@@ -61,9 +55,7 @@ define double @select_fcmp_ogt_f64(double %a, double %b, double %c, double %d) {
 ; CHECK-NEXT: orpd %xmm2, %xmm1
 ; AVX-LABEL: select_fcmp_ogt_f64
 ; AVX: vcmpltsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp ogt double %a, %b
   %2 = select i1 %1, double %c, double %d
   ret double %2
@@ -77,9 +69,7 @@ define float @select_fcmp_oge_f32(float %a, float %b, float %c, float %d) {
 ; CHECK-NEXT: orps %xmm2, %xmm1
 ; AVX-LABEL: select_fcmp_oge_f32
 ; AVX: vcmpless %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp oge float %a, %b
   %2 = select i1 %1, float %c, float %d
   ret float %2
@@ -93,9 +83,7 @@ define double @select_fcmp_oge_f64(double %a, double %b, double %c, double %d) {
 ; CHECK-NEXT: orpd %xmm2, %xmm1
 ; AVX-LABEL: select_fcmp_oge_f64
 ; AVX: vcmplesd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp oge double %a, %b
   %2 = select i1 %1, double %c, double %d
   ret double %2
@@ -109,9 +97,7 @@ define float @select_fcmp_olt_f32(float %a, float %b, float %c, float %d) {
 ; CHECK-NEXT: orps %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_olt_f32
 ; AVX: vcmpltss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp olt float %a, %b
   %2 = select i1 %1, float %c, float %d
   ret float %2
@@ -125,9 +111,7 @@ define double @select_fcmp_olt_f64(double %a, double %b, double %c, double %d) {
 ; CHECK-NEXT: orpd %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_olt_f64
 ; AVX: vcmpltsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp olt double %a, %b
   %2 = select i1 %1, double %c, double %d
   ret double %2
@@ -141,9 +125,7 @@ define float @select_fcmp_ole_f32(float %a, float %b, float %c, float %d) {
 ; CHECK-NEXT: orps %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_ole_f32
 ; AVX: vcmpless %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp ole float %a, %b
   %2 = select i1 %1, float %c, float %d
   ret float %2
@@ -157,9 +139,7 @@ define double @select_fcmp_ole_f64(double %a, double %b, double %c, double %d) {
 ; CHECK-NEXT: orpd %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_ole_f64
 ; AVX: vcmplesd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp ole double %a, %b
   %2 = select i1 %1, double %c, double %d
   ret double %2
@@ -173,9 +153,7 @@ define float @select_fcmp_ord_f32(float %a, float %b, float %c, float %d) {
 ; CHECK-NEXT: orps %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_ord_f32
 ; AVX: vcmpordss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp ord float %a, %b
   %2 = select i1 %1, float %c, float %d
   ret float %2
@@ -189,9 +167,7 @@ define double @select_fcmp_ord_f64(double %a, double %b, double %c, double %d) {
 ; CHECK-NEXT: orpd %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_ord_f64
 ; AVX: vcmpordsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp ord double %a, %b
   %2 = select i1 %1, double %c, double %d
   ret double %2
@@ -205,9 +181,7 @@ define float @select_fcmp_uno_f32(float %a, float %b, float %c, float %d) {
 ; CHECK-NEXT: orps %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_uno_f32
 ; AVX: vcmpunordss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp uno float %a, %b
   %2 = select i1 %1, float %c, float %d
   ret float %2
@@ -221,9 +195,7 @@ define double @select_fcmp_uno_f64(double %a, double %b, double %c, double %d) {
 ; CHECK-NEXT: orpd %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_uno_f64
 ; AVX: vcmpunordsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp uno double %a, %b
   %2 = select i1 %1, double %c, double %d
   ret double %2
@@ -237,9 +209,7 @@ define float @select_fcmp_ugt_f32(float %a, float %b, float %c, float %d) {
 ; CHECK-NEXT: orps %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_ugt_f32
 ; AVX: vcmpnless %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp ugt float %a, %b
   %2 = select i1 %1, float %c, float %d
   ret float %2
@@ -253,9 +223,7 @@ define double @select_fcmp_ugt_f64(double %a, double %b, double %c, double %d) {
 ; CHECK-NEXT: orpd %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_ugt_f64
 ; AVX: vcmpnlesd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp ugt double %a, %b
   %2 = select i1 %1, double %c, double %d
   ret double %2
@@ -269,9 +237,7 @@ define float @select_fcmp_uge_f32(float %a, float %b, float %c, float %d) {
 ; CHECK-NEXT: orps %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_uge_f32
 ; AVX: vcmpnltss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp uge float %a, %b
   %2 = select i1 %1, float %c, float %d
   ret float %2
@@ -285,9 +251,7 @@ define double @select_fcmp_uge_f64(double %a, double %b, double %c, double %d) {
 ; CHECK-NEXT: orpd %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_uge_f64
 ; AVX: vcmpnltsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp uge double %a, %b
   %2 = select i1 %1, double %c, double %d
   ret double %2
@@ -301,9 +265,7 @@ define float @select_fcmp_ult_f32(float %a, float %b, float %c, float %d) {
 ; CHECK-NEXT: orps %xmm2, %xmm1
 ; AVX-LABEL: select_fcmp_ult_f32
 ; AVX: vcmpnless %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp ult float %a, %b
   %2 = select i1 %1, float %c, float %d
   ret float %2
@@ -317,9 +279,7 @@ define double @select_fcmp_ult_f64(double %a, double %b, double %c, double %d) {
 ; CHECK-NEXT: orpd %xmm2, %xmm1
 ; AVX-LABEL: select_fcmp_ult_f64
 ; AVX: vcmpnlesd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp ult double %a, %b
   %2 = select i1 %1, double %c, double %d
   ret double %2
@@ -333,9 +293,7 @@ define float @select_fcmp_ule_f32(float %a, float %b, float %c, float %d) {
 ; CHECK-NEXT: orps %xmm2, %xmm1
 ; AVX-LABEL: select_fcmp_ule_f32
 ; AVX: vcmpnltss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp ule float %a, %b
   %2 = select i1 %1, float %c, float %d
   ret float %2
@@ -349,9 +307,7 @@ define double @select_fcmp_ule_f64(double %a, double %b, double %c, double %d) {
 ; CHECK-NEXT: orpd %xmm2, %xmm1
 ; AVX-LABEL: select_fcmp_ule_f64
 ; AVX: vcmpnltsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp ule double %a, %b
   %2 = select i1 %1, double %c, double %d
   ret double %2
@@ -365,9 +321,7 @@ define float @select_fcmp_une_f32(float %a, float %b, float %c, float %d) {
 ; CHECK-NEXT: orps %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_une_f32
 ; AVX: vcmpneqss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp une float %a, %b
   %2 = select i1 %1, float %c, float %d
   ret float %2
@@ -381,9 +335,7 @@ define double @select_fcmp_une_f64(double %a, double %b, double %c, double %d) {
 ; CHECK-NEXT: orpd %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_une_f64
 ; AVX: vcmpneqsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
   %1 = fcmp une double %a, %b
   %2 = select i1 %1, double %c, double %d
   ret double %2
-- 
2.34.1