}
if (EltVT == MVT::f32) {
- // Bits [7:6] of the constant are the source select. This will always be
- // zero here. The DAG Combiner may combine an extract_elt index into
- // these
- // bits. For example (insert (extract, 3), 2) could be matched by
- // putting
- // the '3' into bits [7:6] of X86ISD::INSERTPS.
- // Bits [5:4] of the constant are the destination select. This is the
- // value of the incoming immediate.
- // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
+ // Bits [7:6] of the constant are the source select. This will always be
+ // zero here. The DAG Combiner may combine an extract_elt index into
+ // these bits. For example (insert (extract, 3), 2) could be matched by
+ // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
+ // Bits [5:4] of the constant are the destination select. This is the
+ // value of the incoming immediate.
+ // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// combine either bitwise AND or insert of float 0.0 to set these bits.
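+ //
+ // As a rough sketch (illustrative names, not part of this patch), the
+ // full immediate could be assembled as:
+ //   unsigned Imm = (SrcIdx << 6) | (DstIdx << 4) | ZeroMask;
+ // so the (insert (extract, 3), 2) example above would encode as
+ // (3 << 6) | (2 << 4) | 0 == 0xE0.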
+
+ const Function *F = DAG.getMachineFunction().getFunction();
+ bool MinSize = F->hasFnAttribute(Attribute::MinSize);
+ if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
+ // If this is an insertion of 32 bits into the low 32 bits of
+ // a vector, we prefer to generate a blend with an immediate rather
+ // than an insertps. Blends are simpler operations in hardware and so
+ // will always have equal or better performance than insertps.
+ // But if optimizing for size and there's a load folding opportunity,
+ // generate an insertps, because blendps does not have a 32-bit
+ // memory-operand form.
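+ // Bit i of the BLENDI immediate selects lane i from the second operand,
+ // so an immediate of 1 takes lane 0 from N1 and lanes 1-3 from N0.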
+ N2 = DAG.getIntPtrConstant(1);
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+ return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
+ }
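+ // Shift the insert index into bits [5:4], the destination select of the
+ // insertps immediate described above.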
N2 = DAG.getIntPtrConstant(IdxVal << 4);
// Create this as a scalar-to-vector node.
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
-define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind {
-; X32-LABEL: insertps_2:
+; When optimizing for speed, prefer blendps over insertps even if it means we have to
+; generate a separate movss to load the scalar operand.
+define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
+; X32-LABEL: blendps_not_insertps_1:
+; X32: ## BB#0:
+; X32-NEXT: movss {{.*#+}} xmm1
+; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: blendps_not_insertps_1:
+; X64: ## BB#0:
+; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: retq
+ %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
+ ret <4 x float> %tmp1
+}
+
+; When optimizing for size, generate an insertps if there's a load folding opportunity.
+; The i386 and x86-64 ABIs differ for the float operand: on i386 it is passed on the
+; stack (a foldable load), while on x86-64 it arrives in a register. So we should
+; generate an insertps for X32 but not for X64!
+define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
+; X32-LABEL: insertps_or_blendps:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X32-NEXT: retl
;
-; X64-LABEL: insertps_2:
+; X64-LABEL: insertps_or_blendps:
; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
%tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
ret <4 x float> %tmp1
}
-define <4 x float> @insertps_3(<4 x float> %t1, <4 x float> %t2) nounwind {
-; X32-LABEL: insertps_3:
+
+; An insert into the low 32 bits of a vector from the low 32 bits of another vector
+; is always just a blendps, because blendps is never more expensive than insertps.
+define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
+; X32-LABEL: blendps_not_insertps_2:
; X32: ## BB#0:
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
-; X64-LABEL: insertps_3:
+; X64-LABEL: blendps_not_insertps_2:
; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
%tmp2 = extractelement <4 x float> %t2, i32 0
%tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0