From 68db37b952be497c94c7aa98cf26f3baadb5afd3 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 14 Aug 2013 23:24:45 +0000
Subject: [PATCH] R600/SI: Convert v16i8 resource descriptors to i128

Now that compute support is better on SI, we can't continue using v16i8
for descriptors since this is also a legal type in OpenCL.

This patch fixes numerous hangs with the piglit OpenCL test and since
we now use a target specific DAG node for LOAD_CONSTANT with the
correct MemOperandFlags, this should also fix:

https://bugs.freedesktop.org/show_bug.cgi?id=66805

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188429 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/R600/AMDGPU.h                |   1 +
 lib/Target/R600/AMDGPUISelLowering.cpp  |   6 +
 lib/Target/R600/AMDGPUISelLowering.h    |   7 ++
 lib/Target/R600/AMDGPUTargetMachine.cpp |   1 +
 lib/Target/R600/CMakeLists.txt          |   1 +
 lib/Target/R600/SIISelLowering.cpp      |  55 ++++++++-
 lib/Target/R600/SIISelLowering.h        |   3 +
 lib/Target/R600/SIInstrInfo.td          |  20 ++++
 lib/Target/R600/SIInstructions.td       |  76 ++++++------
 lib/Target/R600/SIIntrinsics.td         |   6 +-
 lib/Target/R600/SIRegisterInfo.td       |   2 +-
 lib/Target/R600/SITypeRewriter.cpp      | 146 ++++++++++++++++++++++++
 test/CodeGen/R600/llvm.SI.sample.ll     |  34 +++---
 test/CodeGen/R600/llvm.SI.sampled.ll    |  34 +++---
 14 files changed, 314 insertions(+), 78 deletions(-)
 create mode 100644 lib/Target/R600/SITypeRewriter.cpp
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index 6b374cbfff2..e2d1caf8676 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -34,6 +34,7 @@ FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm);
 FunctionPass *createAMDGPUCFGStructurizerPass(TargetMachine &tm);
 
 // SI Passes
+FunctionPass *createSITypeRewriter();
 FunctionPass *createSIAnnotateControlFlowPass();
 FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
 FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index efd27567fe2..9bb487e550c 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -507,5 +507,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(CONST_ADDRESS)
   NODE_NAME_CASE(REGISTER_LOAD)
   NODE_NAME_CASE(REGISTER_STORE)
+  NODE_NAME_CASE(LOAD_CONSTANT)
+  NODE_NAME_CASE(LOAD_INPUT)
+  NODE_NAME_CASE(SAMPLE)
+  NODE_NAME_CASE(SAMPLEB)
+  NODE_NAME_CASE(SAMPLED)
+  NODE_NAME_CASE(SAMPLEL)
   }
 }
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index f614e23e2ce..5419e71e7f2 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -139,6 +139,13 @@ enum {
   CONST_ADDRESS,
   REGISTER_LOAD,
   REGISTER_STORE,
+  LOAD_INPUT,
+  SAMPLE,
+  SAMPLEB,
+  SAMPLED,
+  SAMPLEL,
+  FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
+  LOAD_CONSTANT,
   LAST_AMDGPU_ISD_NUMBER
 };
 
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index 5ebc5f27dc4..d77cdddf8b5 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -121,6 +121,7 @@ AMDGPUPassConfig::addPreISel() {
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
   addPass(createFlattenCFGPass());
   if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+    addPass(createSITypeRewriter());
     addPass(createStructurizeCFGPass());
     addPass(createSIAnnotateControlFlowPass());
   } else {
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index fde187b0d3e..658eeea8148 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -47,6 +47,7 @@ add_llvm_target(R600CodeGen
   SILowerControlFlow.cpp
   SIMachineFunctionInfo.cpp
   SIRegisterInfo.cpp
+  SITypeRewriter.cpp
   )
 
 add_dependencies(LLVMR600CodeGen AMDGPUCommonTableGen intrinsics_gen)
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 4631f8a3a8b..40f082723af 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -37,7 +37,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   addRegisterClass(MVT::v2i1, &AMDGPU::VReg_64RegClass);
   addRegisterClass(MVT::v4i1, &AMDGPU::VReg_128RegClass);
 
-  addRegisterClass(MVT::v16i8, &AMDGPU::SReg_128RegClass);
   addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
   addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
 
@@ -70,6 +69,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::ADD, MVT::i64, Legal);
   setOperationAction(ISD::ADD, MVT::i32, Legal);
 
+  setOperationAction(ISD::BITCAST, MVT::i128, Legal);
+
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
 
@@ -82,6 +83,9 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);
 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
 
   setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
 
@@ -415,7 +419,31 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     case Intrinsic::r600_read_tidig_z:
       return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                   AMDGPU::VGPR2, VT);
-
+    case AMDGPUIntrinsic::SI_load_const: {
+      SDValue Ops [] = {
+        ResourceDescriptorToi128(Op.getOperand(1), DAG),
+        Op.getOperand(2)
+      };
+
+      MachineMemOperand *MMO = new MachineMemOperand(MachinePointerInfo(),
+                    MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
+                    VT.getSizeInBits() / 8, 4);
+      return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
+                                     Op->getVTList(), Ops, 2, VT, MMO);
+    }
+    case AMDGPUIntrinsic::SI_sample:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
+    case AMDGPUIntrinsic::SI_sampleb:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
+    case AMDGPUIntrinsic::SI_sampled:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
+    case AMDGPUIntrinsic::SI_samplel:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
+    case AMDGPUIntrinsic::SI_vs_load_input:
+      return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
+                         ResourceDescriptorToi128(Op.getOperand(1), DAG),
+                         Op.getOperand(2),
+                         Op.getOperand(3));
     }
   }
   }
@@ -516,6 +544,29 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
   return Chain;
 }
 
+/// Return \p Op if it is already i128; otherwise it is asserted to be undef
+/// and an all-zero i128 (a pair of zero i64 halves) is returned instead.
+SDValue SITargetLowering::ResourceDescriptorToi128(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  if (Op.getValueType() != MVT::i128) {
+    // Anything that is not already i128 is expected to be undef.
+    assert(Op.getOpcode() == ISD::UNDEF);
+    SDValue Zero = DAG.getConstant(0, MVT::i64);
+    return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), MVT::i128, Zero, Zero);
+  }
+
+  return Op;
+}
+
+// Rebuild a sample intrinsic as node \p Opcode, passing operand 3 as i128.
+SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
+                                               const SDValue &Op,
+                                               SelectionDAG &DAG) const {
+  SDValue Sampler = ResourceDescriptorToi128(Op.getOperand(3), DAG);
+  return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
+                     Op.getOperand(2), Sampler, Op.getOperand(4));
+}
+
 SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index effbf1f85de..321e58c153b 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -23,11 +23,14 @@ namespace llvm {
 class SITargetLowering : public AMDGPUTargetLowering {
   SDValue LowerParameter(SelectionDAG &DAG, EVT VT, SDLoc DL,
                          SDValue Chain, unsigned Offset) const;
+  SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op,
+                               SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue ResourceDescriptorToi128(SDValue Op, SelectionDAG &DAG) const;
   bool foldImm(SDValue &Operand, int32_t &Immediate,
                bool &ScalarSlotUsed) const;
   const TargetRegisterClass *getRegClassForNode(SelectionDAG &DAG,
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index df6d99410d9..b7419782d34 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -16,6 +16,26 @@ def SIadd64bit32bit : SDNode<"ISD::ADD",
   SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisVT<0, i64>, SDTCisVT<2, i32>]>
 >;
 
+// Lowered form of int_SI_load_const: (i128 descriptor, i32 offset) -> f32.
+def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
+  SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, i128>, SDTCisVT<2, i32>]>,
+  [SDNPMayLoad, SDNPMemOperand]>;
+
+// Lowered form of int_SI_vs_load_input: (i128, i16, i32) -> v4f32.
+def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT",
+  SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, i128>, SDTCisVT<2, i16>,
+                       SDTCisVT<3, i32>]>>;
+
+// Profile shared by the sample nodes: (vector, v32i8, i128, i32) -> v4f32.
+class SDSample<string opcode> : SDNode <opcode,
+  SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVec<1>, SDTCisVT<2, v32i8>,
+                       SDTCisVT<3, i128>, SDTCisVT<4, i32>]>>;
+
+def SIsample : SDSample<"AMDGPUISD::SAMPLE">;
+def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">;
+def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
+def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
+
 // Transformation function, extract the lower 32bit of a 64bit immediate
 def LO32 : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32);
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index d4e0b033702..47042174891 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1303,7 +1303,7 @@ def : Pat <
 
 /* int_SI_vs_load_input */
 def : Pat<
-  (int_SI_vs_load_input v16i8:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr),
+  (SIload_input i128:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr),
   (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset)
 >;
 
@@ -1324,63 +1324,63 @@ def : Pat <
 /********** Image sampling patterns **********/
 /********** ======================= **********/
 
-/* int_SI_sample for simple 1D texture lookup */
+/* SIsample for simple 1D texture lookup */
 def : Pat <
-  (int_SI_sample v1i32:$addr, v32i8:$rsrc, v16i8:$sampler, imm),
+  (SIsample v1i32:$addr, v32i8:$rsrc, i128:$sampler, imm),
   (IMAGE_SAMPLE_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
-class SamplePattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, imm),
+class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+    (name vt:$addr, v32i8:$rsrc, i128:$sampler, imm),
     (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
-class SampleRectPattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_RECT),
+class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+    (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_RECT),
     (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
-class SampleArrayPattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_ARRAY),
+class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+    (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_ARRAY),
     (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
-class SampleShadowPattern<Intrinsic name, MIMG opcode,
+class SampleShadowPattern<SDNode name, MIMG opcode,
                           ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_SHADOW),
+    (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW),
     (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
-class SampleShadowArrayPattern<Intrinsic name, MIMG opcode,
+class SampleShadowArrayPattern<SDNode name, MIMG opcode,
                                ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_SHADOW_ARRAY),
+    (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW_ARRAY),
     (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
-/* int_SI_sample* for texture lookups consuming more address parameters */
+/* SIsample* for texture lookups consuming more address parameters */
 multiclass SamplePatterns<MIMG sample, MIMG sample_c, MIMG sample_l,
                           MIMG sample_c_l, MIMG sample_b, MIMG sample_c_b,
 MIMG sample_d, MIMG sample_c_d, ValueType addr_type> {
-  def : SamplePattern <int_SI_sample, sample, addr_type>;
-  def : SampleRectPattern <int_SI_sample, sample, addr_type>;
-  def : SampleArrayPattern <int_SI_sample, sample, addr_type>;
-  def : SampleShadowPattern <int_SI_sample, sample_c, addr_type>;
-  def : SampleShadowArrayPattern <int_SI_sample, sample_c, addr_type>;
-
-  def : SamplePattern <int_SI_samplel, sample_l, addr_type>;
-  def : SampleArrayPattern <int_SI_samplel, sample_l, addr_type>;
-  def : SampleShadowPattern <int_SI_samplel, sample_c_l, addr_type>;
-  def : SampleShadowArrayPattern <int_SI_samplel, sample_c_l, addr_type>;
-
-  def : SamplePattern <int_SI_sampleb, sample_b, addr_type>;
-  def : SampleArrayPattern <int_SI_sampleb, sample_b, addr_type>;
-  def : SampleShadowPattern <int_SI_sampleb, sample_c_b, addr_type>;
-  def : SampleShadowArrayPattern <int_SI_sampleb, sample_c_b, addr_type>;
-
-  def : SamplePattern <int_SI_sampled, sample_d, addr_type>;
-  def : SampleArrayPattern <int_SI_sampled, sample_d, addr_type>;
-  def : SampleShadowPattern <int_SI_sampled, sample_c_d, addr_type>;
-  def : SampleShadowArrayPattern <int_SI_sampled, sample_c_d, addr_type>;
+  def : SamplePattern <SIsample, sample, addr_type>;
+  def : SampleRectPattern <SIsample, sample, addr_type>;
+  def : SampleArrayPattern <SIsample, sample, addr_type>;
+  def : SampleShadowPattern <SIsample, sample_c, addr_type>;
+  def : SampleShadowArrayPattern <SIsample, sample_c, addr_type>;
+
+  def : SamplePattern <SIsamplel, sample_l, addr_type>;
+  def : SampleArrayPattern <SIsamplel, sample_l, addr_type>;
+  def : SampleShadowPattern <SIsamplel, sample_c_l, addr_type>;
+  def : SampleShadowArrayPattern <SIsamplel, sample_c_l, addr_type>;
+
+  def : SamplePattern <SIsampleb, sample_b, addr_type>;
+  def : SampleArrayPattern <SIsampleb, sample_b, addr_type>;
+  def : SampleShadowPattern <SIsampleb, sample_c_b, addr_type>;
+  def : SampleShadowArrayPattern <SIsampleb, sample_c_b, addr_type>;
+
+  def : SamplePattern <SIsampled, sample_d, addr_type>;
+  def : SampleArrayPattern <SIsampled, sample_d, addr_type>;
+  def : SampleShadowPattern <SIsampled, sample_c_d, addr_type>;
+  def : SampleShadowArrayPattern <SIsampled, sample_c_d, addr_type>;
 }
 
 defm : SamplePatterns<IMAGE_SAMPLE_V2, IMAGE_SAMPLE_C_V2,
@@ -1694,19 +1694,19 @@ def : Pat <
 
 // 1. Offset as 8bit DWORD immediate
 def : Pat <
-  (int_SI_load_const v16i8:$sbase, IMM8bitDWORD:$offset),
+  (SIload_constant i128:$sbase, IMM8bitDWORD:$offset),
   (S_BUFFER_LOAD_DWORD_IMM $sbase, IMM8bitDWORD:$offset)
 >;
 
 // 2. Offset loaded in an 32bit SGPR
 def : Pat <
-  (int_SI_load_const v16i8:$sbase, imm:$offset),
+  (SIload_constant i128:$sbase, imm:$offset),
   (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
 >;
 
 // 3. Offset in an 32Bit VGPR
 def : Pat <
-  (int_SI_load_const v16i8:$sbase, i32:$voff),
+  (SIload_constant i128:$sbase, i32:$voff),
   (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff)
 >;
 
@@ -1777,7 +1777,7 @@ defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
 defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
 defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, i64>;
 defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v16i8>;
+defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, i128>;
 defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
index 2fa073ea099..d6e26adf65b 100644
--- a/lib/Target/R600/SIIntrinsics.td
+++ b/lib/Target/R600/SIIntrinsics.td
@@ -17,10 +17,10 @@ let TargetPrefix = "SI", isTarget = 1 in {
   def int_SI_tid : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>;
   def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
   def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
-  def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v16i8_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
+  def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
 
-  class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
 
   def int_SI_sample : Sample;
   def int_SI_sampleb : Sample;
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index 82d1e719257..0b90772f288 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -157,7 +157,7 @@ def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64,
   (add SGPR_64, VCCReg, EXECReg)
 >;
 
-def SReg_128 : RegisterClass<"AMDGPU", [v16i8, i128], 128, (add SGPR_128)>;
+def SReg_128 : RegisterClass<"AMDGPU", [i128], 128, (add SGPR_128)>;
 
 def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>;
 
diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp
new file mode 100644
index 00000000000..9da11e88eb8
--- /dev/null
+++ b/lib/Target/R600/SITypeRewriter.cpp
@@ -0,0 +1,146 @@
+//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass performs the following type substitution on all
+/// non-compute shaders:
+///
+/// v16i8 => i128
+///   - v16i8 is used for constant memory resource descriptors.  This type is
+///      legal for some compute APIs, and we don't want to declare it as legal
+///      in the backend, because we want the legalizer to expand all v16i8
+///      operations.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InstVisitor.h"
+
+using namespace llvm;
+
+namespace {
+
+/// Function pass that walks every instruction through InstVisitor and applies
+/// the type substitution described in the file header.
+class SITypeRewriter : public FunctionPass,
+                       public InstVisitor<SITypeRewriter> {
+  static char ID;
+  Module *Mod; // Set in doInitialization().
+  Type *v16i8; // Set in doInitialization().
+  Type *i128;  // Set in doInitialization().
+
+public:
+  SITypeRewriter() : FunctionPass(ID) { }
+  virtual bool doInitialization(Module &M);
+  virtual bool runOnFunction(Function &F);
+  virtual const char *getPassName() const { return "SI Type Rewriter"; }
+  // InstVisitor callbacks.
+  void visitLoadInst(LoadInst &I);
+  void visitCallInst(CallInst &I);
+  void visitBitCast(BitCastInst &I);
+};
+
+} // End anonymous namespace
+
+char SITypeRewriter::ID = 0;
+
+bool SITypeRewriter::doInitialization(Module &M) {
+  Mod = &M; // Used by visitCallInst() to look up / create functions.
+  i128 = IntegerType::get(M.getContext(), 128);
+  v16i8 = VectorType::get(IntegerType::get(M.getContext(), 8), 16);
+  return false; // Only caches state; the module itself is not modified.
+}
+
+// Rewrite v16i8 descriptor uses in non-compute shaders; skip compute shaders.
+bool SITypeRewriter::runOnFunction(Function &F) {
+  AttributeSet Set = F.getAttributes();
+  Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, "ShaderType");
+  unsigned ShaderType = ShaderType::COMPUTE; // Default when no attribute.
+  if (A.isStringAttribute()) {
+    A.getValueAsString().getAsInteger(0, ShaderType);
+  }
+  // The substitution is documented as applying to non-compute shaders only.
+  if (ShaderType == ShaderType::COMPUTE) {
+    return false;
+  }
+  // The second walk lets visitBitCast() fold casts created by the first one.
+  visit(F);
+  visit(F);
+  return true; // Conservatively report that the IR may have changed.
+}
+
+// Rewrite a <16 x i8> load as an i128 load whose result is cast back.
+void SITypeRewriter::visitLoadInst(LoadInst &I) {
+  Value *Ptr = I.getPointerOperand();
+  if (Ptr->getType()->getPointerElementType() != v16i8) {
+    return;
+  }
+  IRBuilder<> Builder(&I);
+  // Keep the source address space instead of assuming address space 2.
+  Type *NewPtrTy = i128->getPointerTo(I.getPointerAddressSpace());
+  LoadInst *Load = Builder.CreateLoad(Builder.CreateBitCast(Ptr, NewPtrTy));
+  SmallVector <std::pair<unsigned, MDNode*>, 8> MD;
+  I.getAllMetadataOtherThanDebugLoc(MD);
+  for (unsigned i = 0, e = MD.size(); i != e; ++i) {
+    Load->setMetadata(MD[i].first, MD[i].second);
+  }
+  I.replaceAllUsesWith(Builder.CreateBitCast(Load, I.getType()));
+  I.eraseFromParent();
+}
+
+// Redirect calls taking <16 x i8> arguments to a ".i128"-suffixed variant.
+void SITypeRewriter::visitCallInst(CallInst &I) {
+  Function *F = I.getCalledFunction();
+  if (!F) // Indirect call: there is no callee name to derive from.
+    return;
+  IRBuilder<> Builder(&I);
+  SmallVector <Value*, 8> Args;
+  SmallVector <Type*, 8> Types;
+  bool NeedToReplace = false;
+  std::string Name = F->getName().str();
+  for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
+    Value *Arg = I.getArgOperand(i);
+    if (Arg->getType() == v16i8) {
+      Arg = Builder.CreateBitCast(Arg, i128);
+      NeedToReplace = true;
+      Name += ".i128"; // One suffix per rewritten argument.
+    }
+    Args.push_back(Arg);
+    Types.push_back(Arg->getType());
+  }
+  if (NeedToReplace) {
+    Function *NewF = Mod->getFunction(Name);
+    if (!NewF) {
+      FunctionType *FTy = FunctionType::get(F->getReturnType(), Types, false);
+      NewF = Function::Create(FTy, GlobalValue::ExternalLinkage, Name, Mod);
+      NewF->setAttributes(F->getAttributes());
+    }
+    I.replaceAllUsesWith(Builder.CreateCall(NewF, Args));
+    I.eraseFromParent();
+  }
+}
+
+// Fold the cast chain i128 -> T -> i128: users of the outer cast are pointed
+// at the original i128 value and the outer cast is erased.
+void SITypeRewriter::visitBitCast(BitCastInst &I) {
+  if (I.getDestTy() != i128) {
+    return;
+  }
+  BitCastInst *Inner = dyn_cast<BitCastInst>(I.getOperand(0));
+  if (!Inner || Inner->getSrcTy() != i128) {
+    return;
+  }
+  I.replaceAllUsesWith(Inner->getOperand(0));
+  I.eraseFromParent();
+}
+
+// Factory for the pass; declared in AMDGPU.h and scheduled from
+// AMDGPUPassConfig::addPreISel() for post-Northern-Islands subtargets.
+FunctionPass *llvm::createSITypeRewriter() { return new SITypeRewriter(); }
diff --git a/test/CodeGen/R600/llvm.SI.sample.ll b/test/CodeGen/R600/llvm.SI.sample.ll
index dc2948a78f2..1c830a9919c 100644
--- a/test/CodeGen/R600/llvm.SI.sample.ll
+++ b/test/CodeGen/R600/llvm.SI.sample.ll
@@ -35,37 +35,37 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
    %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2
    %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3
    %res1 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v1,
-      <32 x i8> undef, <4 x i32> undef, i32 1)
+      <32 x i8> undef, <16 x i8> undef, i32 1)
    %res2 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v2,
-      <32 x i8> undef, <4 x i32> undef, i32 2)
+      <32 x i8> undef, <16 x i8> undef, i32 2)
    %res3 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v3,
-      <32 x i8> undef, <4 x i32> undef, i32 3)
+      <32 x i8> undef, <16 x i8> undef, i32 3)
    %res4 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v4,
-      <32 x i8> undef, <4 x i32> undef, i32 4)
+      <32 x i8> undef, <16 x i8> undef, i32 4)
    %res5 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v5,
-      <32 x i8> undef, <4 x i32> undef, i32 5)
+      <32 x i8> undef, <16 x i8> undef, i32 5)
    %res6 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v6,
-      <32 x i8> undef, <4 x i32> undef, i32 6)
+      <32 x i8> undef, <16 x i8> undef, i32 6)
    %res7 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v7,
-      <32 x i8> undef, <4 x i32> undef, i32 7)
+      <32 x i8> undef, <16 x i8> undef, i32 7)
    %res8 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v8,
-      <32 x i8> undef, <4 x i32> undef, i32 8)
+      <32 x i8> undef, <16 x i8> undef, i32 8)
    %res9 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v9,
-      <32 x i8> undef, <4 x i32> undef, i32 9)
+      <32 x i8> undef, <16 x i8> undef, i32 9)
    %res10 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v10,
-      <32 x i8> undef, <4 x i32> undef, i32 10)
+      <32 x i8> undef, <16 x i8> undef, i32 10)
    %res11 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v11,
-      <32 x i8> undef, <4 x i32> undef, i32 11)
+      <32 x i8> undef, <16 x i8> undef, i32 11)
    %res12 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v12,
-      <32 x i8> undef, <4 x i32> undef, i32 12)
+      <32 x i8> undef, <16 x i8> undef, i32 12)
    %res13 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v13,
-      <32 x i8> undef, <4 x i32> undef, i32 13)
+      <32 x i8> undef, <16 x i8> undef, i32 13)
    %res14 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v14,
-      <32 x i8> undef, <4 x i32> undef, i32 14)
+      <32 x i8> undef, <16 x i8> undef, i32 14)
    %res15 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v15,
-      <32 x i8> undef, <4 x i32> undef, i32 15)
+      <32 x i8> undef, <16 x i8> undef, i32 15)
    %res16 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v16,
-      <32 x i8> undef, <4 x i32> undef, i32 16)
+      <32 x i8> undef, <16 x i8> undef, i32 16)
    %e1 = extractelement <4 x float> %res1, i32 0
    %e2 = extractelement <4 x float> %res2, i32 1
    %e3 = extractelement <4 x float> %res3, i32 2
@@ -135,6 +135,6 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
    ret void
 }
 
-declare <4 x float> @llvm.SI.sample.(<4 x i32>, <32 x i8>, <4 x i32>, i32) readnone
+declare <4 x float> @llvm.SI.sample.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone
 
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/R600/llvm.SI.sampled.ll b/test/CodeGen/R600/llvm.SI.sampled.ll
index 56645deaa2a..d43b3789f8b 100644
--- a/test/CodeGen/R600/llvm.SI.sampled.ll
+++ b/test/CodeGen/R600/llvm.SI.sampled.ll
@@ -35,37 +35,37 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
    %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2
    %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3
    %res1 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v1,
-      <32 x i8> undef, <4 x i32> undef, i32 1)
+      <32 x i8> undef, <16 x i8> undef, i32 1)
    %res2 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v2,
-      <32 x i8> undef, <4 x i32> undef, i32 2)
+      <32 x i8> undef, <16 x i8> undef, i32 2)
    %res3 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v3,
-      <32 x i8> undef, <4 x i32> undef, i32 3)
+      <32 x i8> undef, <16 x i8> undef, i32 3)
    %res4 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v4,
-      <32 x i8> undef, <4 x i32> undef, i32 4)
+      <32 x i8> undef, <16 x i8> undef, i32 4)
    %res5 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v5,
-      <32 x i8> undef, <4 x i32> undef, i32 5)
+      <32 x i8> undef, <16 x i8> undef, i32 5)
    %res6 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v6,
-      <32 x i8> undef, <4 x i32> undef, i32 6)
+      <32 x i8> undef, <16 x i8> undef, i32 6)
    %res7 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v7,
-      <32 x i8> undef, <4 x i32> undef, i32 7)
+      <32 x i8> undef, <16 x i8> undef, i32 7)
    %res8 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v8,
-      <32 x i8> undef, <4 x i32> undef, i32 8)
+      <32 x i8> undef, <16 x i8> undef, i32 8)
    %res9 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v9,
-      <32 x i8> undef, <4 x i32> undef, i32 9)
+      <32 x i8> undef, <16 x i8> undef, i32 9)
    %res10 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v10,
-      <32 x i8> undef, <4 x i32> undef, i32 10)
+      <32 x i8> undef, <16 x i8> undef, i32 10)
    %res11 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v11,
-      <32 x i8> undef, <4 x i32> undef, i32 11)
+      <32 x i8> undef, <16 x i8> undef, i32 11)
    %res12 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v12,
-      <32 x i8> undef, <4 x i32> undef, i32 12)
+      <32 x i8> undef, <16 x i8> undef, i32 12)
    %res13 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v13,
-      <32 x i8> undef, <4 x i32> undef, i32 13)
+      <32 x i8> undef, <16 x i8> undef, i32 13)
    %res14 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v14,
-      <32 x i8> undef, <4 x i32> undef, i32 14)
+      <32 x i8> undef, <16 x i8> undef, i32 14)
    %res15 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v15,
-      <32 x i8> undef, <4 x i32> undef, i32 15)
+      <32 x i8> undef, <16 x i8> undef, i32 15)
    %res16 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v16,
-      <32 x i8> undef, <4 x i32> undef, i32 16)
+      <32 x i8> undef, <16 x i8> undef, i32 16)
    %e1 = extractelement <4 x float> %res1, i32 0
    %e2 = extractelement <4 x float> %res2, i32 1
    %e3 = extractelement <4 x float> %res3, i32 2
@@ -135,6 +135,6 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
    ret void
 }
 
-declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <32 x i8>, <4 x i32>, i32) readnone
+declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone
 
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-- 
2.34.1