[X86, AVX] replace vextractf128 intrinsics with generic shuffles

author Sanjay Patel <spatel@rotateright.com>

Thu, 12 Mar 2015 15:15:19 +0000 (15:15 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Thu, 12 Mar 2015 15:15:19 +0000 (15:15 +0000)
author Sanjay Patel <spatel@rotateright.com>
Thu, 12 Mar 2015 15:15:19 +0000 (15:15 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Thu, 12 Mar 2015 15:15:19 +0000 (15:15 +0000)
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td

index 998b249b8321ad658acc1a0628f90af93d08c544..b7ffeebfc5f623e8c0488f7397371b6399407f58 100644 (file)
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -1172,19 +1172,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
                    llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
  }
  
-// Vector extract and insert
-let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx_vextractf128_pd_256 :
-        GCCBuiltin<"__builtin_ia32_vextractf128_pd256">,
-        Intrinsic<[llvm_v2f64_ty], [llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx_vextractf128_ps_256 :
-        GCCBuiltin<"__builtin_ia32_vextractf128_ps256">,
-        Intrinsic<[llvm_v4f32_ty], [llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx_vextractf128_si_256 :
-        GCCBuiltin<"__builtin_ia32_vextractf128_si256">,
-        Intrinsic<[llvm_v4i32_ty], [llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
-}
-
  // Vector convert
  let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
    def int_x86_avx_cvtdq2_pd_256 : GCCBuiltin<"__builtin_ia32_cvtdq2pd256">,
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

index b9e6e44e3532b57c77cac743bb8a21d25a808dac..d4d449a2e0582ee6a2fd6004694529149ebcb3e2 100644 (file)
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4978,9 +4978,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
      setValue(&I, Res);
      return nullptr;
    }
-  case Intrinsic::x86_avx_vextractf128_pd_256:
-  case Intrinsic::x86_avx_vextractf128_ps_256:
-  case Intrinsic::x86_avx_vextractf128_si_256:
    case Intrinsic::x86_avx2_vextracti128: {
      EVT DestVT = TLI.getValueType(I.getType());
      uint64_t Idx = (cast<ConstantInt>(I.getArgOperand(1))->getZExtValue() & 1) *
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp

index f9493bc2a81858e5d6aec8256dbcbe8e1d453cf4..40757053b6c92ece0e6aaa5e0aa427069fc08a64 100644 (file)
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -161,6 +161,9 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
          Name == "x86.avx.vinsertf128.pd.256" ||
          Name == "x86.avx.vinsertf128.ps.256" ||
          Name == "x86.avx.vinsertf128.si.256" ||
+        Name == "x86.avx.vextractf128.pd.256" ||
+        Name == "x86.avx.vextractf128.ps.256" ||
+        Name == "x86.avx.vextractf128.si.256" ||
          Name == "x86.avx.movnt.dq.256" ||
          Name == "x86.avx.movnt.pd.256" ||
          Name == "x86.avx.movnt.ps.256" ||
@@ -676,6 +679,26 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
          Idxs2.push_back(Builder.getInt32(Idx));
        }
        Rep = Builder.CreateShuffleVector(Op0, Rep, ConstantVector::get(Idxs2));
+    } else if (Name == "llvm.x86.avx.vextractf128.pd.256" ||
+               Name == "llvm.x86.avx.vextractf128.ps.256" ||
+               Name == "llvm.x86.avx.vextractf128.si.256") {
+      Value *Op0 = CI->getArgOperand(0);
+      unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+      VectorType *VecTy = cast<VectorType>(CI->getType());
+      unsigned NumElts = VecTy->getNumElements();
+      
+      // Mask off the high bits of the immediate value; hardware ignores those.
+      Imm = Imm & 1;
+
+      // Get indexes for either the high half or low half of the input vector.
+      SmallVector<Constant*, 4> Idxs(NumElts);
+      for (unsigned i = 0; i != NumElts; ++i) {
+        unsigned Idx = Imm ? (i + NumElts) : i;
+        Idxs[i] = Builder.getInt32(Idx);
+      }
+
+      Value *UndefV = UndefValue::get(Op0->getType());
+      Rep = Builder.CreateShuffleVector(Op0, UndefV, ConstantVector::get(Idxs));
      } else {
        bool PD128 = false, PD256 = false, PS128 = false, PS256 = false;
        if (Name == "llvm.x86.avx.vpermil.pd.256")
diff --git a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll

index a3908c6be043d1ab3571be66743de5fffbbb91c5..e2f690bff232f9f6d9b1fb15926839b226f13307 100644 (file)
--- a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -36,6 +36,43 @@ define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1
  }
  declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
  
+; We don't check any vextractf128 variant with immediate 0 because that selects the low half, which is just a move.
+
+define <2 x double> @test_x86_avx_vextractf128_pd_256_1(<4 x double> %a0) {
+; CHECK-LABEL:       test_x86_avx_vextractf128_pd_256_1: 
+; CHECK:             vextractf128 $1, %ymm0, %xmm0
+  %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 1)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
+
+define <4 x float> @test_x86_avx_vextractf128_ps_256_1(<8 x float> %a0) {
+; CHECK-LABEL:       test_x86_avx_vextractf128_ps_256_1: 
+; CHECK:             vextractf128 $1, %ymm0, %xmm0
+  %res = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a0, i8 1)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
+
+define <4 x i32> @test_x86_avx_vextractf128_si_256_1(<8 x i32> %a0) {
+; CHECK-LABEL:    test_x86_avx_vextractf128_si_256_1: 
+; CHECK:          vextractf128 $1, %ymm0, %xmm0
+  %res = call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %a0, i8 1)
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
+
+; Verify that the high bits of the immediate are masked off. Immediate 2 has bit 0
+; clear, so this is equivalent to a vextractf128 $0, which should be optimized
+; away; just check that no vextractf128 of any kind is emitted.
+define <2 x double> @test_x86_avx_extractf128_pd_256_2(<4 x double> %a0) {
+; CHECK-LABEL:       test_x86_avx_extractf128_pd_256_2: 
+; CHECK-NOT:         vextractf128
+  %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 2)
+  ret <2 x double> %res
+}
+
+
  define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
  ; CHECK-LABEL:       test_x86_avx_blend_pd_256: 
  ; CHECK:             vblendpd
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll

index 96d80ea7ae62df0f243757c82859b12bc380bbdf..ecf8a19ee6ff76e0c5592114ed416ab015235ee7 100644 (file)
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -2163,30 +2163,6 @@ define <8 x float> @test_x86_avx_vbroadcastf128_ps_256(i8* %a0) {
  declare <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8*) nounwind readonly
  
  
-define <2 x double> @test_x86_avx_vextractf128_pd_256(<4 x double> %a0) {
-  ; CHECK: vextractf128
-  %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 7) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
-
-
-define <4 x float> @test_x86_avx_vextractf128_ps_256(<8 x float> %a0) {
-  ; CHECK: vextractf128
-  %res = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a0, i8 7) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
-
-
-define <4 x i32> @test_x86_avx_vextractf128_si_256(<8 x i32> %a0) {
-  ; CHECK: vextractf128
-  %res = call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %a0, i8 7) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
-
-
  define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
    ; CHECK: vperm2f128
    %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
author	Sanjay Patel <spatel@rotateright.com>
	Thu, 12 Mar 2015 15:15:19 +0000 (15:15 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Thu, 12 Mar 2015 15:15:19 +0000 (15:15 +0000)
include/llvm/IR/IntrinsicsX86.td		patch \| blob \| history
lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp		patch \| blob \| history
lib/IR/AutoUpgrade.cpp		patch \| blob \| history
test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll		patch \| blob \| history
test/CodeGen/X86/avx-intrinsics-x86.ll		patch \| blob \| history