Use movaps / movd to extract vector element 0 even with sse4.1. It's still cheaper than pextrw.

author Evan Cheng <evan.cheng@apple.com>

Fri, 2 Jan 2009 05:29:08 +0000 (05:29 +0000)

committer Evan Cheng <evan.cheng@apple.com>

Fri, 2 Jan 2009 05:29:08 +0000 (05:29 +0000)
author Evan Cheng <evan.cheng@apple.com>
Fri, 2 Jan 2009 05:29:08 +0000 (05:29 +0000)
committer Evan Cheng <evan.cheng@apple.com>
Fri, 2 Jan 2009 05:29:08 +0000 (05:29 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 43a7bb5bb5bdfcd8d8693f97cb95faefff4078ff..5ed6342bb1b435cd3fd340a95e5ee89c3abe8574 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -4215,6 +4215,14 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
                                      DAG.getValueType(VT));
      return DAG.getNode(ISD::TRUNCATE, VT, Assert);
    } else if (VT.getSizeInBits() == 16) {
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
+    if (Idx == 0)
+      return DAG.getNode(ISD::TRUNCATE, MVT::i16,
+                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32,
+                                     DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32,
+                                                 Op.getOperand(0)),
+                                     Op.getOperand(1)));
      SDValue Extract = DAG.getNode(X86ISD::PEXTRW, MVT::i32,
                                      Op.getOperand(0), Op.getOperand(1));
      SDValue Assert  = DAG.getNode(ISD::AssertZext, MVT::i32, Extract,
diff --git a/test/CodeGen/X86/vec_extract.ll b/test/CodeGen/X86/vec_extract.ll

index b45f9398e4fbb443436a524580c214886569597a..f1f009ec814fde49ed7665f4492a57d2436a2de7 100644 (file)
--- a/test/CodeGen/X86/vec_extract.ll
+++ b/test/CodeGen/X86/vec_extract.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2,-sse41 -o %t -f
  ; RUN: grep movss    %t | count 3
  ; RUN: grep movhlps  %t | count 1
  ; RUN: grep pshufd   %t | count 1
diff --git a/test/CodeGen/X86/vec_insert-2.ll b/test/CodeGen/X86/vec_insert-2.ll

index a6d4f014d29f043d291ccd1d9c9e024b88e9697a..8d0bcc4fbf340897d9e65d56d8a84ebca52dec99 100644 (file)
--- a/test/CodeGen/X86/vec_insert-2.ll
+++ b/test/CodeGen/X86/vec_insert-2.ll
@@ -1,8 +1,8 @@
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep {\$36,} | count 2
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep shufps | count 2
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pinsrw | count 1
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movhpd | count 1
-; RUN: llvm-as < %s | llc -march=x86-64 -mattr=+sse2 | grep unpcklpd | count 1
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2,-sse41 | grep {\$36,} | count 2
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2,-sse41 | grep shufps | count 2
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2,-sse41 | grep pinsrw | count 1
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2,-sse41 | grep movhpd | count 1
+; RUN: llvm-as < %s | llc -march=x86-64 -mattr=+sse2,-sse41 | grep unpcklpd | count 1
  
  define <4 x float> @t1(float %s, <4 x float> %tmp) nounwind {
          %tmp1 = insertelement <4 x float> %tmp, float %s, i32 3
diff --git a/test/CodeGen/X86/vec_insert-3.ll b/test/CodeGen/X86/vec_insert-3.ll

index e42a3684899a99966898e3d676e5c0aae8880311..e43eca4b875f53119c44f3d9be8e28521d18c8a5 100644 (file)
--- a/test/CodeGen/X86/vec_insert-3.ll
+++ b/test/CodeGen/X86/vec_insert-3.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=x86-64 -mattr=+sse2 | grep punpcklqdq | count 1
+; RUN: llvm-as < %s | llc -march=x86-64 -mattr=+sse2,-sse41 | grep punpcklqdq | count 1
  
  define <2 x i64> @t1(i64 %s, <2 x i64> %tmp) nounwind {
          %tmp1 = insertelement <2 x i64> %tmp, i64 %s, i32 1
diff --git a/test/CodeGen/X86/vec_insert.ll b/test/CodeGen/X86/vec_insert.ll

index c8c9f141ec1c4eccac5dad564c459babec66f255..e032c5b8549c082702468a35cc71db617ab42692 100644 (file)
--- a/test/CodeGen/X86/vec_insert.ll
+++ b/test/CodeGen/X86/vec_insert.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movss | count 1
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | not grep pinsrw
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2,-sse41 | grep movss | count 1
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2,-sse41 | not grep pinsrw
  
  define void @test(<4 x float>* %F, i32 %I) {
         %tmp = load <4 x float>* %F             ; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_shuffle-12.ll b/test/CodeGen/X86/vec_shuffle-12.ll

index 3f49b02543bbffb2812d0cb348ddd4f206976b1b..aad27ea2f8a5c7ca9db59c02dbb6e5413d690d52 100644 (file)
--- a/test/CodeGen/X86/vec_shuffle-12.ll
+++ b/test/CodeGen/X86/vec_shuffle-12.ll
@@ -5,24 +5,24 @@
  ; RUN: grep pshuflw %t | count 3
  ; RUN: grep pshufhw %t | count 2
  
-define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) {
+define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind {
         %tmp1 = load <8 x i16>* %A
         %tmp2 = load <8 x i16>* %B
         %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
         ret <8 x i16> %tmp3
  }
  
-define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) {
+define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind {
         %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 9, i32 1, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7 >
         ret <8 x i16> %tmp
  }
  
-define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) {
+define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind {
         %tmp = shufflevector <8 x i16> %A, <8 x i16> %A, <8 x i32> < i32 8, i32 3, i32 2, i32 13, i32 7, i32 6, i32 5, i32 4 >
         ret <8 x i16> %tmp
  }
  
-define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) {
+define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind {
         %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 7, i32 2, i32 3, i32 1, i32 5, i32 6, i32 5 >
         ret <8 x i16> %tmp
  }
diff --git a/test/CodeGen/X86/vec_shuffle-28.ll b/test/CodeGen/X86/vec_shuffle-28.ll

index e73b824d021f619389e6b8b302419858549eaa85..0c81e77b95e1ecb855ef9ea36be2c815c4827c00 100644 (file)
--- a/test/CodeGen/X86/vec_shuffle-28.ll
+++ b/test/CodeGen/X86/vec_shuffle-28.ll
@@ -1,6 +1,6 @@
  ; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 -o %t -f
  ; RUN: grep punpcklwd %t | count 1
-; RUN: grep pextrw %t | count 8
+; RUN: grep pextrw %t | count 6
  ; RUN: grep pinsrw %t | count 8
  
  
diff --git a/test/CodeGen/X86/widen_arith-1.ll b/test/CodeGen/X86/widen_arith-1.ll

index 13683b159f28f627b3075bf19eac56b5a1aca12f..419078174d1a6306fa3a318202ab7e8c5f5ca3b2 100644 (file)
--- a/test/CodeGen/X86/widen_arith-1.ll
+++ b/test/CodeGen/X86/widen_arith-1.ll
@@ -1,7 +1,7 @@
  ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse42 -disable-mmx -o %t -f
  ; RUN: grep paddb  %t | count 1
  ; RUN: grep pextrb %t | count 1
-; RUN: grep pextrw %t | count 1
+; RUN: not grep pextrw %t
  
  ; Widen a v3i8 to v16i8 to use a vector add
author	Evan Cheng <evan.cheng@apple.com>
	Fri, 2 Jan 2009 05:29:08 +0000 (05:29 +0000)
committer	Evan Cheng <evan.cheng@apple.com>
	Fri, 2 Jan 2009 05:29:08 +0000 (05:29 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch | blob | history
test/CodeGen/X86/vec_extract.ll		patch | blob | history
test/CodeGen/X86/vec_insert-2.ll		patch | blob | history
test/CodeGen/X86/vec_insert-3.ll		patch | blob | history
test/CodeGen/X86/vec_insert.ll		patch | blob | history
test/CodeGen/X86/vec_shuffle-12.ll		patch | blob | history
test/CodeGen/X86/vec_shuffle-28.ll		patch | blob | history
test/CodeGen/X86/widen_arith-1.ll		patch | blob | history